Merge pull request #9612 from adrianeboyd/chore/switch-to-master-v3.2.0

Switch v3.2.0 to master
Adriane Boyd committed on 2021-11-03 16:27:34 +01:00 via GitHub, commit 2bf52c44b1
148 changed files with 3381 additions and 1690 deletions


@@ -65,8 +65,11 @@ steps:
     condition: eq(${{ parameters.gpu }}, true)
   - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
+      #python -m spacy download ca_core_news_sm
+      #python -m spacy download ca_core_news_md
+      # temporarily install the v3.1.0 models
+      pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl
+      pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl
       python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
@@ -95,7 +98,8 @@ steps:
   - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      # temporarily ignore W095
+      PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
     displayName: 'Test assemble CLI'
     condition: eq(variables['python_version'], '3.8')
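For context, the PYTHONWARNINGS value above uses Python's standard warning-filter syntax (action:message:category entries, comma-separated, applied left to right). A rough in-process equivalent of the new filter, shown only as a sketch of the semantics and not anything from this PR, would be:

import warnings

# Turn every warning into an error ...
warnings.filterwarnings("error")
# ... except the temporary [W095] UserWarning raised when the v3.1.0 Catalan
# models are loaded under spaCy v3.2.0 (the message pattern is matched against
# the start of the warning text), and DeprecationWarning in general.
warnings.filterwarnings("ignore", message=r"\[W095\]", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)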

.github/contributors/avi197.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Son Pham |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 09/10/2021 |
| GitHub username | Avi197 |
| Website (optional) | |

.github/contributors/fgaim.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Fitsum Gaim |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021-08-07 |
| GitHub username | fgaim |
| Website (optional) | |

.github/contributors/syrull.md (vendored, new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Dimitar Ganev |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2021/8/2 |
| GitHub username | syrull |
| Website (optional) | |

.gitignore (vendored, 1 line added)

@@ -9,6 +9,7 @@ keys/
 spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt
+spacy/tests/universe/universe.json
 
 # Website
 website/.cache/


@@ -1,5 +1,6 @@
 # Our libraries
 spacy-legacy>=3.0.8,<3.1.0
+spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.12,<8.1.0
@@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
 jinja2
+langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0


@@ -42,6 +42,7 @@ setup_requires =
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.8,<3.1.0
+    spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
@@ -62,6 +63,7 @@ install_requires =
     setuptools
     packaging>=20.0
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
+    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =
@@ -69,9 +71,9 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.2,<1.1.0
+    spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.0.1,<1.2.0
+    spacy_transformers>=1.1.2,<1.2.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =


@@ -81,6 +81,7 @@ COPY_FILES = {
     ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
     ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
     ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
+    ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
 }


@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.1.4"
+__version__ = "3.2.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"


@@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
-            if strings_map is not None and isinstance(value, basestring):
+            if strings_map is not None and isinstance(value, str):
                 if hasattr(strings_map, 'add'):
                     value = strings_map.add(value)
                 else:
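This hunk only replaces the Python 2 basestring check with str. As a quick illustration of what intify_attrs does with string values when a string store is passed (the attribute values below are made up):

# Sketch: intify_attrs maps attribute names to their integer IDs and interns
# string values via the supplied string store. Example values are invented.
from spacy.attrs import intify_attrs
from spacy.strings import StringStore

strings = StringStore()
attrs = intify_attrs({"LEMMA": "run", "POS": "VERB"}, strings_map=strings)
print(attrs)  # keys are integer attribute IDs, values are string-store hashes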


@@ -20,6 +20,7 @@ def init_vectors_cli(
     output_dir: Path = Arg(..., help="Pipeline output directory"),
     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
+    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
@@ -34,7 +35,14 @@ def init_vectors_cli(
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
         update_lexemes(nlp, jsonl_loc)
-    convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name)
+    convert_vectors(
+        nlp,
+        vectors_loc,
+        truncate=truncate,
+        prune=prune,
+        name=name,
+        mode=mode,
+    )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
     msg.good(
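The new --mode option is forwarded straight to convert_vectors, so the same conversion can be scripted directly. A minimal sketch, assuming convert_vectors is importable from spacy.training.initialize (the import path, file paths and vectors name below are assumptions; the keyword arguments mirror the call above):

# Sketch of the programmatic equivalent of
#   python -m spacy init vectors en vectors.floret.gz ./output_dir --mode floret
from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors  # assumed import path

nlp = spacy.blank("en")
convert_vectors(
    nlp,
    Path("vectors.floret.gz"),  # hypothetical floret-exported vectors file
    truncate=0,
    prune=-1,
    name="en_demo.vectors",     # hypothetical vectors name
    mode="floret",
)
nlp.to_disk("./output_dir")     # hypothetical output directory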


@@ -5,6 +5,7 @@ raw_text = null
 max_epochs = 1000
 dropout = 0.2
 n_save_every = null
+n_save_epoch = null
 component = "tok2vec"
 layer = ""
 corpus = "corpora.pretrain"


@@ -22,6 +22,9 @@ def setup_default_warnings():
     # warn once about lemmatizer without required POS
     filter_warning("once", error_msg=Warnings.W108)
 
+    # floret vector table cannot be modified
+    filter_warning("once", error_msg="[W114]")
+
 
 def filter_warning(action: str, error_msg: str):
     """Customize how spaCy should handle a certain warning.
@@ -186,6 +189,8 @@ class Warnings(metaclass=ErrorsWithCodes):
             "vectors are not identical to current pipeline vectors.")
     W114 = ("Using multiprocessing with GPU models is not recommended and may "
             "lead to errors.")
+    W115 = ("Skipping {method}: the floret vector table cannot be modified. "
+            "Vectors are calculated from character ngrams.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -277,7 +282,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "you forget to call the `set_extension` method?")
    E047 = ("Can't assign a value to unregistered extension attribute "
            "'{name}'. Did you forget to call the `set_extension` method?")
-   E048 = ("Can't import language {lang} from spacy.lang: {err}")
+   E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}")
    E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
            "package or a valid path to a data directory.")
    E052 = ("Can't find model directory: {path}")
@@ -511,13 +516,24 @@ class Errors(metaclass=ErrorsWithCodes):
    E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
    E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
            "issue tracker: http://github.com/explosion/spaCy/issues")
-   E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
+   E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
 
    # New errors added in v3.x
-   E866 = ("A SpanGroup is not functional after the corresponding Doc has "
+   E858 = ("The {mode} vector table does not support this operation. "
+           "{alternative}")
+   E859 = ("The floret vector table cannot be modified.")
+   E860 = ("Can't truncate fasttext-bloom vectors.")
+   E861 = ("No 'keys' should be provided when initializing floret vectors "
+           "with 'minn' and 'maxn'.")
+   E862 = ("'hash_count' must be between 1-4 for floret vectors.")
+   E863 = ("'maxn' must be greater than or equal to 'minn'.")
+   E864 = ("The complete vector table 'data' is required to initialize floret "
+           "vectors.")
+   E865 = ("A SpanGroup is not functional after the corresponding Doc has "
            "been garbage collected. To keep using the spans, make sure that "
            "the corresponding Doc object is still available in the scope of "
            "your function.")
+   E866 = ("Expected a string or 'Doc' as input, but got: {type}.")
    E867 = ("The 'textcat' component requires at least two labels because it "
            "uses mutually exclusive classes where exactly one label is True "
            "for each doc. For binary classification tasks, you can use two "


@@ -124,7 +124,7 @@ cdef class KnowledgeBase:
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index]
 
-    def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
+    def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
         Add an entity to the KB, optionally specifying its log probability based on corpus frequency
         Return the hash of the entity ID/name at the end.
@@ -185,15 +185,15 @@ cdef class KnowledgeBase:
             i += 1
 
-    def contains_entity(self, unicode entity):
+    def contains_entity(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings.add(entity)
         return entity_hash in self._entry_index
 
-    def contains_alias(self, unicode alias):
+    def contains_alias(self, str alias):
         cdef hash_t alias_hash = self.vocab.strings.add(alias)
         return alias_hash in self._alias_index
 
-    def add_alias(self, unicode alias, entities, probabilities):
+    def add_alias(self, str alias, entities, probabilities):
         """
         For a given alias, add its potential entities and prior probabilities to the KB.
         Return the alias_hash at the end
@@ -239,7 +239,7 @@ cdef class KnowledgeBase:
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash
 
-    def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
         """
         For an alias already existing in the KB, extend its potential entities with one more.
         Throw a warning if either the alias or the entity is unknown,
@@ -286,7 +286,7 @@ cdef class KnowledgeBase:
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry
 
-    def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]:
+    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -307,7 +307,7 @@ cdef class KnowledgeBase:
                 for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
 
-    def get_vector(self, unicode entity):
+    def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
 
         # Return an empty list if this entity is unknown in this KB
@@ -317,7 +317,7 @@ cdef class KnowledgeBase:
         return self._vectors_table[self._entries[entry_index].vector_index]
 
-    def get_prior_prob(self, unicode entity, unicode alias):
+    def get_prior_prob(self, str entity, str alias):
         """ Return the prior probability of a given alias being linked to a given entity,
         or return 0.0 when this combination is not known in the knowledge base"""
         cdef hash_t alias_hash = self.vocab.strings[alias]
@@ -587,7 +587,7 @@ cdef class Writer:
     def __init__(self, path):
         assert isinstance(path, Path)
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
             raise IOError(Errors.E146.format(path=path))
@@ -629,7 +629,7 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
+        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
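The kb.pyx hunks only swap the Python 2 unicode type for str in the method signatures; the public KnowledgeBase API is unchanged. A minimal usage sketch of the methods touched here (entity IDs, aliases, frequencies and vectors are invented):

# Sketch of the KnowledgeBase methods whose signatures changed above.
# All entity IDs, aliases, frequencies and vectors are illustrative only.
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

kb.add_entity(entity="Q42", freq=100, entity_vector=[0.1, 0.2, 0.3])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])

assert kb.contains_entity("Q42")
assert kb.contains_alias("Douglas")
print(kb.get_prior_prob("Q42", "Douglas"))                    # 0.9
print([c.entity_ for c in kb.get_alias_candidates("Douglas")])  # ["Q42"]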


@@ -1,7 +1,7 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
 from ..char_classes import UNITS, ALPHA_UPPER
 
-_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split()
+_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
 
 _suffixes = (
     _list_punct


@@ -1,265 +1,79 @@ (the old one-word-per-line list from https://github.com/Alir3z4/stop-words is rewritten; the new version follows)
"""
References:
    https://github.com/Alir3z4/stop-words - Original list, serves as a base.
    https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it.
"""

STOP_WORDS = set(
    """
а автентичен аз ако ала

бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат
бъде бъда бяха

в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все
всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху
вътре веднъж

г ги главен главна главно глас го годно година години годишен

д да дали далеч далече два двама двамата две двете ден днес дни до добра добре
добро добър достатъчно докато докога дори досега доста друг друга другаде други

е евтин едва един една еднаква еднакви еднакъв едно екип ето

живот жив

за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал
затова запазва започвам защо защото завинаги

и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки
извън имайки

й йо

каза казва казвайки казвам как каква какво както какъв като кога кауза каузи
когато когото което които кой който колко която къде където към край кратък
кръгъл

лесен лесно ли летя летиш летим лош

м май малко макар малцина междувременно минус ме между мек мен месец ми мис
мисля много мнозина мога могат може мой можем мокър моля момента му

н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина
например наопаки наполовина напоследък нека независимо нас насам наскоро
настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова
нови новина някои някой някога някъде няколко няма

о обаче около описан опитах опитва опитвайки опитвам определен определено освен
обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу
отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам
относно още

п пак по повече повечето под поне просто пряко поради после последен последно
посочен почти прави прав прави правя пред преди през при пък първата първи първо
път пъти плюс

равен равна различен различни разумен разумно

с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след
следващ следващия следва следното следователно случва сме смях собствен
сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам
съответен съответно сте съм със също

т така техен техни такива такъв твърде там трета твой те тези ти то това
тогава този той търси толкова точно три трябва тук тъй тя тях

у утре ужасно употреба успоредно уточнен уточняване

харесва харесали хиляди

ч часа ценя цяло цялостен че често чрез чудя

ще щеше щом щяха

юмрук

я як
    """.split()
)


@@ -1,10 +1,16 @@ (new version shown)
"""
References:
    https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations
    (countries, occupations, fields of studies and more).
"""
from ...symbols import ORTH, NORM


_exc = {}

# measurements
for abbr in [
    {ORTH: "м", NORM: "метър"},
    {ORTH: "мм", NORM: "милиметър"},
    {ORTH: "см", NORM: "сантиметър"},
@@ -17,51 +23,191 @@ (new version shown)
    {ORTH: "хл", NORM: "хектолиър"},
    {ORTH: "дкл", NORM: "декалитър"},
    {ORTH: "л", NORM: "литър"},
]:
    _exc[abbr[ORTH]] = [abbr]

# line abbreviations
for abbr in [
    {ORTH: "г-жа", NORM: "госпожа"},
    {ORTH: "г", NORM: "господин"},
    {ORTH: "г-ца", NORM: "госпожица"},
    {ORTH: "д-р", NORM: "доктор"},
    {ORTH: "о", NORM: "остров"},
    {ORTH: "п-в", NORM: "полуостров"},
    {ORTH: "с-у", NORM: "срещу"},
    {ORTH: "в-у", NORM: "върху"},
    {ORTH: "м-у", NORM: "между"},
]:
    _exc[abbr[ORTH]] = [abbr]

# foreign language related abbreviations
for abbr in [
    {ORTH: "англ.", NORM: "английски"},
    {ORTH: "ан.", NORM: "английски термин"},
    {ORTH: "араб.", NORM: "арабски"},
    {ORTH: "афр.", NORM: "африкански"},
    {ORTH: "гр.", NORM: "гръцки"},
    {ORTH: "лат.", NORM: "латински"},
    {ORTH: "рим.", NORM: "римски"},
    {ORTH: "старогр.", NORM: "старогръцки"},
    {ORTH: "староевр.", NORM: "староеврейски"},
    {ORTH: "фр.", NORM: "френски"},
    {ORTH: "хол.", NORM: "холандски"},
    {ORTH: "швед.", NORM: "шведски"},
    {ORTH: "шотл.", NORM: "шотландски"},
    {ORTH: "яп.", NORM: "японски"},
]:
    _exc[abbr[ORTH]] = [abbr]

# profession and academic titles abbreviations
for abbr in [
    {ORTH: "акад.", NORM: "академик"},
    {ORTH: "арх.", NORM: "архитект"},
    {ORTH: "инж.", NORM: "инженер"},
    {ORTH: "канц.", NORM: "канцлер"},
    {ORTH: "проф.", NORM: "професор"},
    {ORTH: "св.", NORM: "свети"},
]:
    _exc[abbr[ORTH]] = [abbr]

# fields of studies
for abbr in [
    {ORTH: "агр.", NORM: "агрономия"},
    {ORTH: "ав.", NORM: "авиация"},
    {ORTH: "агр.", NORM: "агрономия"},
    {ORTH: "археол.", NORM: "археология"},
    {ORTH: "астр.", NORM: "астрономия"},
    {ORTH: "геод.", NORM: "геодезия"},
    {ORTH: "геол.", NORM: "геология"},
    {ORTH: "геом.", NORM: "геометрия"},
    {ORTH: "гимн.", NORM: "гимнастика"},
    {ORTH: "грам.", NORM: "граматика"},
    {ORTH: "жур.", NORM: "журналистика"},
    {ORTH: "журн.", NORM: "журналистика"},
    {ORTH: "зем.", NORM: "земеделие"},
    {ORTH: "икон.", NORM: "икономика"},
    {ORTH: "лит.", NORM: "литература"},
    {ORTH: "мат.", NORM: "математика"},
    {ORTH: "мед.", NORM: "медицина"},
    {ORTH: "муз.", NORM: "музика"},
    {ORTH: "печ.", NORM: "печатарство"},
    {ORTH: "пол.", NORM: "политика"},
    {ORTH: "псих.", NORM: "психология"},
    {ORTH: "соц.", NORM: "социология"},
    {ORTH: "стат.", NORM: "статистика"},
    {ORTH: "стил.", NORM: "стилистика"},
    {ORTH: "топогр.", NORM: "топография"},
    {ORTH: "търг.", NORM: "търговия"},
    {ORTH: "фарм.", NORM: "фармацевтика"},
    {ORTH: "фехт.", NORM: "фехтовка"},
    {ORTH: "физиол.", NORM: "физиология"},
    {ORTH: "физ.", NORM: "физика"},
    {ORTH: "фил.", NORM: "философия"},
    {ORTH: "фин.", NORM: "финанси"},
    {ORTH: "фолкл.", NORM: "фолклор"},
    {ORTH: "фон.", NORM: "фонетика"},
    {ORTH: "фот.", NORM: "фотография"},
    {ORTH: "футб.", NORM: "футбол"},
    {ORTH: "хим.", NORM: "химия"},
    {ORTH: "хир.", NORM: "хирургия"},
    {ORTH: "ел.", NORM: "електротехника"},
]:
    _exc[abbr[ORTH]] = [abbr]

for abbr in [
    {ORTH: "ал.", NORM: "алинея"},
    {ORTH: "авт.", NORM: "автоматично"},
    {ORTH: "адм.", NORM: "администрация"},
    {ORTH: "арт.", NORM: "артилерия"},
    {ORTH: "бл.", NORM: "блок"},
    {ORTH: "бр.", NORM: "брой"},
    {ORTH: "бул.", NORM: "булевард"},
    {ORTH: "букв.", NORM: "буквално"},
    {ORTH: "в.", NORM: "век"},
    {ORTH: "вр.", NORM: "време"},
    {ORTH: "вм.", NORM: "вместо"},
    {ORTH: "воен.", NORM: "военен термин"},
    {ORTH: "г.", NORM: "година"},
    {ORTH: "гр.", NORM: "град"},
    {ORTH: "гл.", NORM: "глагол"},
    {ORTH: "др.", NORM: "други"},
    {ORTH: "ез.", NORM: "езеро"},
    {ORTH: "ж.р.", NORM: "женски род"},
    {ORTH: "жп.", NORM: "железопът"},
    {ORTH: "застр.", NORM: "застрахователно дело"},
    {ORTH: "знач.", NORM: "значение"},
    {ORTH: "и др.", NORM: "и други"},
    {ORTH: "и под.", NORM: "и подобни"},
    {ORTH: "и пр.", NORM: "и прочие"},
    {ORTH: "изр.", NORM: "изречение"},
    {ORTH: "изт.", NORM: "източен"},
    {ORTH: "конкр.", NORM: "конкретно"},
    {ORTH: "лв.", NORM: "лев"},
    {ORTH: "л.", NORM: "лице"},
    {ORTH: "м.р.", NORM: "мъжки род"},
    {ORTH: "мин.вр.", NORM: "минало време"},
    {ORTH: "мн.ч.", NORM: "множествено число"},
    {ORTH: "напр.", NORM: "например"},
    {ORTH: "нар.", NORM: "наречие"},
    {ORTH: "науч.", NORM: "научен термин"},
    {ORTH: "непр.", NORM: "неправилно"},
    {ORTH: "обик.", NORM: "обикновено"},
    {ORTH: "опред.", NORM: "определение"},
    {ORTH: "особ.", NORM: "особено"},
    {ORTH: "ост.", NORM: "остаряло"},
    {ORTH: "относ.", NORM: "относително"},
    {ORTH: "отр.", NORM: "отрицателно"},
    {ORTH: "пл.", NORM: "площад"},
    {ORTH: "пад.", NORM: "падеж"},
    {ORTH: "парл.", NORM: "парламентарен"},
    {ORTH: "погов.", NORM: "поговорка"},
    {ORTH: "пон.", NORM: "понякога"},
    {ORTH: "правосл.", NORM: "православен"},
    {ORTH: "прибл.", NORM: "приблизително"},
    {ORTH: "прил.", NORM: "прилагателно име"},
    {ORTH: "пр.", NORM: "прочие"},
    {ORTH: "с.", NORM: "село"},
    {ORTH: "с.р.", NORM: "среден род"},
    {ORTH: "сп.", NORM: "списание"},
    {ORTH: "стр.", NORM: "страница"},
    {ORTH: "сз.", NORM: "съюз"},
    {ORTH: "сег.", NORM: "сегашно"},
    {ORTH: "сп.", NORM: "спорт"},
    {ORTH: "срв.", NORM: "сравни"},
    {ORTH: "с.ст.", NORM: "селскостопанска техника"},
    {ORTH: "счет.", NORM: "счетоводство"},
    {ORTH: "съкр.", NORM: "съкратено"},
    {ORTH: "съобщ.", NORM: "съобщение"},
    {ORTH: "същ.", NORM: "съществително"},
    {ORTH: "текст.", NORM: "текстилен"},
    {ORTH: "телев.", NORM: "телевизия"},
    {ORTH: "тел.", NORM: "телефон"},
    {ORTH: "т.е.", NORM: "тоест"},
    {ORTH: "т.н.", NORM: "така нататък"},
    {ORTH: "т.нар.", NORM: "така наречен"},
    {ORTH: "търж.", NORM: "тържествено"},
    {ORTH: "ул.", NORM: "улица"},
    {ORTH: "уч.", NORM: "училище"},
    {ORTH: "унив.", NORM: "университет"},
    {ORTH: "харт.", NORM: "хартия"},
    {ORTH: "хидр.", NORM: "хидравлика"},
    {ORTH: "хран.", NORM: "хранителна"},
    {ORTH: "църк.", NORM: "църковен термин"},
    {ORTH: "числ.", NORM: "числително"},
    {ORTH: "чл.", NORM: "член"},
    {ORTH: "ч.", NORM: "число"},
    {ORTH: "числ.", NORM: "числително"},
    {ORTH: "шахм.", NORM: "шахмат"},
    {ORTH: "шах.", NORM: "шахмат"},
    {ORTH: "юр.", NORM: "юридически"},
]:
    _exc[abbr[ORTH]] = [abbr]

# slash abbreviations
for abbr in [
    {ORTH: "м/у", NORM: "между"},
    {ORTH: "с/у", NORM: "срещу"},
]:
    _exc[abbr[ORTH]] = [abbr]

TOKENIZER_EXCEPTIONS = _exc
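A quick way to sanity-check exceptions like these is to push text through a blank pipeline. The snippet below is a sketch and assumes a spaCy installation that already ships this Bulgarian data:

# Sketch: the new exceptions keep abbreviations such as "м/у" as single tokens
# and attach the expanded NORM. Example sentence is invented.
import spacy

nlp = spacy.blank("bg")
doc = nlp("инж. Иванов живее м/у гр. София и с. Бистрица")
print([(t.text, t.norm_) for t in doc])
# e.g. ("м/у", "между"), ("гр.", "град"), ("с.", "село")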


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -23,13 +23,25 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Bengali"]

spacy/lang/ca/__init__.py (file mode: Normal → Executable, 23 lines changed)

@@ -1,9 +1,9 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -15,6 +15,7 @@ class CatalanDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = SYNTAX_ITERATORS
@@ -28,13 +29,25 @@ class Catalan(Language):
 @Catalan.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return CatalanLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Catalan"]

spacy/lang/ca/punctuation.py (file mode: Normal → Executable, 11 lines changed)

@@ -1,4 +1,5 @@
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import LIST_CURRENCY
 from ..char_classes import CURRENCY
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 from ..char_classes import merge_chars, _units
@@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units
 
 ELISION = " ' ".strip().replace(" ", "").replace("\n", "")
 
+_prefixes = (
+    ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
 
 _infixes = (
     LIST_ELLIPSES
@@ -18,6 +27,7 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+        r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')",
     ]
 )
 
@@ -44,3 +54,4 @@ _suffixes = (
 
 TOKENIZER_INFIXES = _infixes
 TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_PREFIXES = _prefixes

spacy/lang/ca/tokenizer_exceptions.py (file mode: Normal → Executable, 21 lines changed)

@@ -18,12 +18,21 @@ for exc_data in [
     {ORTH: "nov.", NORM: "novembre"},
     {ORTH: "dec.", NORM: "desembre"},
     {ORTH: "Dr.", NORM: "doctor"},
+    {ORTH: "Dra.", NORM: "doctora"},
     {ORTH: "Sr.", NORM: "senyor"},
     {ORTH: "Sra.", NORM: "senyora"},
     {ORTH: "Srta.", NORM: "senyoreta"},
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "pl.", NORM: "plaça"},
+    {ORTH: "à."},
+    {ORTH: "è."},
+    {ORTH: "é."},
+    {ORTH: "í."},
+    {ORTH: "ò."},
+    {ORTH: "ó."},
+    {ORTH: "ú."},
     {ORTH: "'l"},
     {ORTH: "'ls"},
     {ORTH: "'m"},
@@ -34,6 +43,18 @@ for exc_data in [
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
 
+_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}]
+_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}]
+
+_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}]
+_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}]
+
+_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}]
+_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}]
+
+_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}]
+
 # Times
 _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -28,13 +28,25 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return GreekLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Greek"]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@@ -26,13 +26,25 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return EnglishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["English"]


@@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer):
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
 
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         """


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
@@ -26,13 +26,25 @@ class Spanish(Language):
 @Spanish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return SpanishLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Spanish"]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
@@ -26,13 +26,25 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return Lemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["Persian"]


@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Callable
 
 from thinc.api import Model
 
@@ -31,13 +31,25 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_config={
+        "model": None,
+        "mode": "rule",
+        "overwrite": False,
+        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+    },
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
-    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool,
+    scorer: Optional[Callable],
 ):
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+    return FrenchLemmatizer(
+        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
+    )
 
 
 __all__ = ["French"]

View File

@ -1,6 +1,11 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from .lemmatizer import IrishLemmatizer
class IrishDefaults(BaseDefaults): class IrishDefaults(BaseDefaults):
@ -13,4 +18,16 @@ class Irish(Language):
Defaults = IrishDefaults Defaults = IrishDefaults
@Irish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Irish"] __all__ = ["Irish"]

View File

@ -1,35 +0,0 @@
# fmt: off
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"]
broad_vowels = ["a", "á", "o", "ó", "u", "ú"]
slender_vowels = ["e", "é", "i", "í"]
vowels = broad_vowels + slender_vowels
# fmt: on
def ends_dentals(word):
if word != "" and word[-1] in ["d", "n", "t", "s"]:
return True
else:
return False
def devoice(word):
if len(word) > 2 and word[-2] == "s" and word[-1] == "d":
return word[:-1] + "t"
else:
return word
def ends_with_vowel(word):
return word != "" and word[-1] in vowels
def starts_with_vowel(word):
return word != "" and word[0] in vowels
def deduplicate(word):
if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants:
return word[:-1]
else:
return word

162
spacy/lang/ga/lemmatizer.py Normal file
View File

@ -0,0 +1,162 @@
from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class IrishLemmatizer(Lemmatizer):
# This is a lookup-based lemmatiser using data extracted from
# BuNaMo (https://github.com/michmech/BuNaMo)
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "pos_lookup":
# fmt: off
required = [
"lemma_lookup_adj", "lemma_lookup_adp",
"lemma_lookup_noun", "lemma_lookup_verb"
]
# fmt: on
return (required, [])
else:
return super().get_lookups_config(mode)
def pos_lookup_lemmatize(self, token: Token) -> List[str]:
univ_pos = token.pos_
string = unponc(token.text)
if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]:
return [string.lower()]
demutated = demutate(string)
secondary = ""
if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú":
secondary = string[1:]
lookup_pos = univ_pos.lower()
if univ_pos == "PROPN":
lookup_pos = "noun"
if token.has_morph():
# TODO: lookup is actually required for the genitive forms, but
# this is not in BuNaMo, and would not be of use with IDT.
if univ_pos == "NOUN" and (
"VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph
):
hpref = "Form=HPref" in token.morph
return [demutate(string, hpref).lower()]
elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph:
return [demutate(string).lower()]
lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
def to_list(value):
if value is None:
value = []
elif not isinstance(value, list):
value = [value]
return value
if univ_pos == "ADP":
return to_list(lookup_table.get(string, string.lower()))
ret = []
if univ_pos == "PROPN":
ret.extend(to_list(lookup_table.get(demutated)))
ret.extend(to_list(lookup_table.get(secondary)))
else:
ret.extend(to_list(lookup_table.get(demutated.lower())))
ret.extend(to_list(lookup_table.get(secondary.lower())))
if len(ret) == 0:
ret = [string.lower()]
return ret
def demutate(word: str, is_hpref: bool = False) -> str:
UVOWELS = "AÁEÉIÍOÓUÚ"
LVOWELS = "aáeéiíoóuú"
lc = word.lower()
# remove eclipsis
if lc.startswith("bhf"):
word = word[2:]
elif lc.startswith("mb"):
word = word[1:]
elif lc.startswith("gc"):
word = word[1:]
elif lc.startswith("nd"):
word = word[1:]
elif lc.startswith("ng"):
word = word[1:]
elif lc.startswith("bp"):
word = word[1:]
elif lc.startswith("dt"):
word = word[1:]
elif word[0:1] == "n" and word[1:2] in UVOWELS:
word = word[1:]
elif lc.startswith("n-") and word[2:3] in LVOWELS:
word = word[2:]
# non-standard eclipsis
elif lc.startswith("bh-f"):
word = word[3:]
elif lc.startswith("m-b"):
word = word[2:]
elif lc.startswith("g-c"):
word = word[2:]
elif lc.startswith("n-d"):
word = word[2:]
elif lc.startswith("n-g"):
word = word[2:]
elif lc.startswith("b-p"):
word = word[2:]
elif lc.startswith("d-t"):
word = word[2:]
# t-prothesis
elif lc.startswith("ts"):
word = word[1:]
elif lc.startswith("t-s"):
word = word[2:]
# h-prothesis, if known to be present
elif is_hpref and word[0:1] == "h":
word = word[1:]
# h-prothesis, simple case
# words can also begin with 'h', but unlike eclipsis,
# a hyphen is not used, so that needs to be handled
# elsewhere
elif word[0:1] == "h" and word[1:2] in UVOWELS:
word = word[1:]
# lenition
# this breaks the previous if, to handle super-non-standard
# text where both eclipsis and lenition were used.
if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h":
word = word[0:1] + word[2:]
return word
def unponc(word: str) -> str:
# fmt: off
PONC = {
"": "bh",
"ċ": "ch",
"": "dh",
"": "fh",
"ġ": "gh",
"": "mh",
"": "ph",
"": "sh",
"": "th",
"": "BH",
"Ċ": "CH",
"": "DH",
"": "FH",
"Ġ": "GH",
"": "MH",
"": "PH",
"": "SH",
"": "TH"
}
# fmt: on
buf = []
for ch in word:
if ch in PONC:
buf.append(PONC[ch])
else:
buf.append(ch)
return "".join(buf)

View File

@ -9,6 +9,8 @@ _exc = {
"ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}], "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}],
"lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}], "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}],
"led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}], "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}],
"théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
"tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}],
} }
for exc_data in [ for exc_data in [
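The two new contractions split "théis"/"tréis" with the norm "tar éis". A minimal check, assuming a blank Irish pipeline is enough to exercise the tokenizer exceptions:

import spacy

nlp = spacy.blank("ga")
print([(t.text, t.norm_) for t in nlp("théis")])
# expected: [("th", "tar"), ("éis", "éis")]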

View File

@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
) )
for u in "cfkCFK":
_exc[f"°{u}"] = [{ORTH: f"°{u}"}]
_exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -23,13 +23,25 @@ class Italian(Language):
@Italian.factory( @Italian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return ItalianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Italian"] __all__ = ["Italian"]

View File

@ -1,21 +1,25 @@
from typing import Optional, Union, Dict, Any from typing import Optional, Union, Dict, Any, Callable
from pathlib import Path from pathlib import Path
import srsly import srsly
from collections import namedtuple from collections import namedtuple
from thinc.api import Model
import re
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP from .tag_bigram_map import TAG_BIGRAM_MAP
from ...compat import copy_reg
from ...errors import Errors from ...errors import Errors
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...tokens import Doc from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from ... import util from ... import util
@ -31,16 +35,21 @@ split_mode = null
@registry.tokenizers("spacy.ja.JapaneseTokenizer") @registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None): def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp): def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode) return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
return japanese_tokenizer_factory return japanese_tokenizer_factory
class JapaneseTokenizer(DummyTokenizer): class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab self.vocab = vocab
self.split_mode = split_mode self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode) self.tokenizer = try_sudachi_import(self.split_mode)
# if we're using split mode A we don't need subtokens
self.need_subtokens = not (split_mode is None or split_mode == "A")
def __reduce__(self):
return JapaneseTokenizer, (self.vocab, self.split_mode)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer):
dtokens, spaces = get_dtokens_and_spaces(dtokens, text) dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules # create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = ( words, tags, inflections, lemmas, norms, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6 zip(*dtokens) if dtokens else [[]] * 7
) )
sub_tokens_list = list(sub_tokens_list) sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces) doc = Doc(self.vocab, words=words, spaces=spaces)
@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer):
) )
# if there's no lemma info (it's an unk) just use the surface # if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections morph = {}
doc.user_data["reading_forms"] = readings if dtoken.inf:
doc.user_data["sub_tokens"] = sub_tokens_list # it's normal for this to be empty for non-inflecting types
morph["Inflection"] = dtoken.inf
token.norm_ = dtoken.norm
if dtoken.reading:
# punctuation is its own reading, but we don't want values like
# "=" here
morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
token.morph = MorphAnalysis(self.vocab, morph)
if self.need_subtokens:
doc.user_data["sub_tokens"] = sub_tokens_list
return doc return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True): def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer):
DetailedToken( DetailedToken(
token.surface(), # orth token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf ";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms'] token.normalized_form(),
token.reading_form(),
sub_tokens_list[idx] sub_tokens_list[idx]
if sub_tokens_list if sub_tokens_list
else None, # user_data['sub_tokens'] else None, # user_data['sub_tokens']
@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer):
] ]
def _get_sub_tokens(self, sudachipy_tokens): def _get_sub_tokens(self, sudachipy_tokens):
if ( # do nothing for default split mode
self.split_mode is None or self.split_mode == "A" if not self.need_subtokens:
): # do nothing for default split mode
return None return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None) sub_tokens_list = [] # list of (list of list of DetailedToken | None)
@ -176,9 +194,33 @@ class Japanese(Language):
Defaults = JapaneseDefaults Defaults = JapaneseDefaults
@Japanese.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={
"model": DEFAULT_MORPH_MODEL,
"overwrite": True,
"extend": True,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
},
default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
):
return Morphologizer(
nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer
)
# Hold the attributes we need with convenient names # Hold the attributes we need with convenient names
DetailedToken = namedtuple( DetailedToken = namedtuple(
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"] "DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]
) )
@ -254,7 +296,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0: elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace() assert text.isspace()
text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)] text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)]
text_spaces = [False] text_spaces = [False]
return text_dtokens, text_spaces return text_dtokens, text_spaces
@ -271,7 +313,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# space token # space token
if word_start > 0: if word_start > 0:
w = text[text_pos : text_pos + word_start] w = text[text_pos : text_pos + word_start]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False) text_spaces.append(False)
text_pos += word_start text_pos += word_start
@ -287,16 +329,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# trailing space token # trailing space token
if text_pos < len(text): if text_pos < len(text):
w = text[text_pos:] w = text[text_pos:]
text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None))
text_spaces.append(False) text_spaces.append(False)
return text_dtokens, text_spaces return text_dtokens, text_spaces
def pickle_japanese(instance):
return Japanese, tuple()
copy_reg.pickle(Japanese, pickle_japanese)
__all__ = ["Japanese"] __all__ = ["Japanese"]

View File

@ -5,11 +5,11 @@ from .tag_map import TAG_MAP
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...tokens import Doc from ...tokens import Doc
from ...compat import copy_reg
from ...scorer import Scorer from ...scorer import Scorer
from ...symbols import POS from ...symbols import POS
from ...training import validate_examples from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -23,17 +23,20 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.ko.KoreanTokenizer") @registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer(): def create_tokenizer():
def korean_tokenizer_factory(nlp): def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp) return KoreanTokenizer(nlp.vocab)
return korean_tokenizer_factory return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, nlp: Language): def __init__(self, vocab: Vocab):
self.vocab = nlp.vocab self.vocab = vocab
MeCab = try_mecab_import() # type: ignore[func-returns-value] MeCab = try_mecab_import() # type: ignore[func-returns-value]
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
def __reduce__(self):
return KoreanTokenizer, (self.vocab,)
def __del__(self): def __del__(self):
self.mecab_tokenizer.__del__() self.mecab_tokenizer.__del__()
@ -106,10 +109,4 @@ def check_spaces(text, tokens):
yield False yield False
def pickle_korean(instance):
return Korean, tuple()
copy_reg.pickle(Korean, pickle_korean)
__all__ = ["Korean"] __all__ = ["Korean"]

View File

@ -3,6 +3,7 @@ import unicodedata
import re import re
from .. import attrs from .. import attrs
from .tokenizer_exceptions import URL_MATCH
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
return True return True
if tld.isalpha() and tld in _tlds: if tld.isalpha() and tld in _tlds:
return True return True
if URL_MATCH(text):
return True
return False return False
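like_url now falls back to the tokenizer's full URL_MATCH pattern after the prefix and TLD heuristics. It can be exercised directly; a small illustrative check, not an exhaustive one:

from spacy.lang.lex_attrs import like_url

print(like_url("www.example.com"))  # True via the existing prefix heuristic
print(like_url("notaurl"))          # False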

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .lemmatizer import MacedonianLemmatizer from .lemmatizer import MacedonianLemmatizer
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -38,13 +38,25 @@ class Macedonian(Language):
@Macedonian.factory( @Macedonian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return MacedonianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Macedonian"] __all__ = ["Macedonian"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
@ -26,13 +26,25 @@ class Norwegian(Language):
@Norwegian.factory( @Norwegian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Norwegian"] __all__ = ["Norwegian"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
@ -30,13 +30,25 @@ class Dutch(Language):
@Dutch.factory( @Dutch.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return DutchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Dutch"] __all__ = ["Dutch"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
@ -33,13 +33,25 @@ class Polish(Language):
@Polish.factory( @Polish.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, default_config={
"model": None,
"mode": "pos_lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return PolishLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Polish"] __all__ = ["Polish"]

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -22,7 +22,12 @@ class Russian(Language):
@Russian.factory( @Russian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
@ -31,8 +36,11 @@ def make_lemmatizer(
name: str, name: str,
mode: str, mode: str,
overwrite: bool, overwrite: bool,
scorer: Optional[Callable],
): ):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return RussianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Russian"] __all__ = ["Russian"]

View File

@ -1,8 +1,9 @@
from typing import Optional, List, Dict, Tuple from typing import Optional, List, Dict, Tuple, Callable
from thinc.api import Model from thinc.api import Model
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...symbols import POS from ...symbols import POS
from ...tokens import Token from ...tokens import Token
from ...vocab import Vocab from ...vocab import Vocab
@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer):
*, *,
mode: str = "pymorphy2", mode: str = "pymorphy2",
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None: ) -> None:
if mode == "pymorphy2": if mode == "pymorphy2":
try: try:
@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer):
) from None ) from None
if getattr(self, "_morph", None) is None: if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer() self._morph = MorphAnalyzer()
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)
def pymorphy2_lemmatize(self, token: Token) -> List[str]: def pymorphy2_lemmatize(self, token: Token) -> List[str]:
string = token.text string = token.text

View File

@ -1,47 +1,195 @@
STOP_WORDS = set( STOP_WORDS = set(
""" """
අතර සහ
එචචර සමග
එපමණ සමඟ
එල අහ
එව ආහ
කට ඕහ
කද අන
අඳ
අප
අප
අය
ආය
ඌය
නම
පමණ
පමණ
චර
පමණ
බඳ
වන
අය
ලද අය
වග
බවට
බව
බව
නම
මහ
මහ
පමණ
පමණ
පමන
වන වන
තර
වත ඇත
වද
සමඟ වශය
යන
සඳහ
මග
ඉත
එම
අතර
සමග
බඳව
බඳ
බව
මහ
තට
වට
අන
නව
බඳ
නට
එහ
එහ
තවත
තව
සහ සහ
දක
බවත
බවද
මත
ඇත
ඇත
වඩ
වඩ
තර
ඉක
යල
ඉත
ටන
පටන
දක
වක
පව
වත
ඇය
මන
වත වත
පත
තව
ඉත
වහ
හන
එම
එමබල
නම
වන
කල
ඉඳ
අන
ඔන
උද
සඳහ
අරබය
එන
එබ
අන
පර
වට
නම
එනම
වස
පර
එහ
""".split() """.split()
) )

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
@ -29,13 +29,25 @@ class Swedish(Language):
@Swedish.factory( @Swedish.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False}, default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Swedish"] __all__ = ["Swedish"]

View File

@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
DEFAULT_CONFIG = """ DEFAULT_CONFIG = """
@ -16,13 +17,13 @@ DEFAULT_CONFIG = """
@registry.tokenizers("spacy.th.ThaiTokenizer") @registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer(): def create_thai_tokenizer():
def thai_tokenizer_factory(nlp): def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp) return ThaiTokenizer(nlp.vocab)
return thai_tokenizer_factory return thai_tokenizer_factory
class ThaiTokenizer(DummyTokenizer): class ThaiTokenizer(DummyTokenizer):
def __init__(self, nlp: Language) -> None: def __init__(self, vocab: Vocab) -> None:
try: try:
from pythainlp.tokenize import word_tokenize from pythainlp.tokenize import word_tokenize
except ImportError: except ImportError:
@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer):
"https://github.com/PyThaiNLP/pythainlp" "https://github.com/PyThaiNLP/pythainlp"
) from None ) from None
self.word_tokenize = word_tokenize self.word_tokenize = word_tokenize
self.vocab = nlp.vocab self.vocab = vocab
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
words = list(self.word_tokenize(text)) words = list(self.word_tokenize(text))
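These tokenizers (Thai here, and the Japanese, Korean, Vietnamese and Chinese ones elsewhere in this diff) now take a Vocab instead of the nlp object. For a custom tokenizer following the same pattern, a hedged sketch: the class and the name "whitespace_tokenizer.v1" are made up for illustration, and DummyTokenizer is an internal helper that only provides no-op serialization hooks.

from spacy.tokens import Doc
from spacy.util import DummyTokenizer, registry
from spacy.vocab import Vocab

class WhitespaceTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        # store the vocab directly rather than an nlp reference
        self.vocab = vocab

    def __call__(self, text: str) -> Doc:
        return Doc(self.vocab, words=text.split())

    def __reduce__(self):
        # makes the tokenizer picklable, mirroring the changes in this diff
        return WhitespaceTokenizer, (self.vocab,)

@registry.tokenizers("whitespace_tokenizer.v1")
def create_whitespace_tokenizer():
    def whitespace_tokenizer_factory(nlp):
        return WhitespaceTokenizer(nlp.vocab)
    return whitespace_tokenizer_factory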

View File

@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"ዜሮ", "ዜሮ",
"", "",
"ክልተ", "ክልተ",
"ሰለስተ", "ሰለስተ",
"ኣርባዕተ", "ኣርባዕተ",
@ -11,66 +11,37 @@ _num_words = [
"ሸውዓተ", "ሸውዓተ",
"ሽሞንተ", "ሽሞንተ",
"ትሽዓተ", "ትሽዓተ",
"ኣሰርተ", "ዓሰርተ",
"ኣሰርተ ሐደ",
"ኣሰርተ ክልተ",
"ኣሰርተ ሰለስተ",
"ኣሰርተ ኣርባዕተ",
"ኣሰርተ ሓሙሽተ",
"ኣሰርተ ሽድሽተ",
"ኣሰርተ ሸውዓተ",
"ኣሰርተ ሽሞንተ",
"ኣሰርተ ትሽዓተ",
"ዕስራ", "ዕስራ",
"ሰላሳ", "ሰላሳ",
"ኣርብዓ", "ኣርብዓ",
"ምሳ", "ሓምሳ",
"ስል", "ሱሳ",
"ሰብዓ", "ሰብዓ",
"ሰማንያ", "ሰማንያ",
"ስዓ", "ቴስዓ",
"ሚእቲ", "ሚእቲ",
"ሺሕ", "ሺሕ",
"ሚልዮን", "ሚልዮን",
"ቢልዮን", "ቢልዮን",
"ትሪልዮን", "ትሪልዮን",
"ኳድሪልዮን", "ኳድሪልዮን",
"ገጅልዮን", "ጋዚልዮን",
"ዝልዮን", "ዚልዮን"
] ]
# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል "
_ordinal_words = [ _ordinal_words = [
"ቀዳማይ", "ቀዳማይ",
"ካልኣይ", "ካልኣይ",
"ሳልሳይ", "ሳልሳይ",
"ራብ", "ራብ",
"ሓምሻይ", "ሓምሻይ",
"ሻድሻይ", "ሻድሻይ",
"ሻውዓይ", "ሻውዓይ",
"ሻምናይ", "ሻምናይ",
"ዘጠነኛ", "ታሽዓይ",
"አስረኛ", "ዓስራይ"
"ኣሰርተ አንደኛ",
"ኣሰርተ ሁለተኛ",
"ኣሰርተ ሶስተኛ",
"ኣሰርተ አራተኛ",
"ኣሰርተ አምስተኛ",
"ኣሰርተ ስድስተኛ",
"ኣሰርተ ሰባተኛ",
"ኣሰርተ ስምንተኛ",
"ኣሰርተ ዘጠነኛ",
"ሃያኛ",
"ሰላሳኛ" "አርባኛ",
"አምሳኛ",
"ስድሳኛ",
"ሰባኛ",
"ሰማንያኛ",
"ዘጠናኛ",
"መቶኛ",
"ሺኛ",
"ሚሊዮንኛ",
"ቢሊዮንኛ",
"ትሪሊዮንኛ",
] ]
@ -92,7 +63,7 @@ def like_num(text):
# Check ordinal number # Check ordinal number
if text_lower in _ordinal_words: if text_lower in _ordinal_words:
return True return True
if text_lower.endswith(""): if text_lower.endswith(""):
if text_lower[:-2].isdigit(): if text_lower[:-2].isdigit():
return True return True

View File

@ -1,7 +1,7 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER from ..char_classes import UNITS, ALPHA_UPPER
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
_suffixes = ( _suffixes = (
_list_punct _list_punct

View File

@ -1,6 +1,27 @@
# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt
# Stop words # Stop words
STOP_WORDS = set( STOP_WORDS = set(
""" """
ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም 'ምበር ' '' ''ውን '' '' 'ዮም 'ዮን
ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል
ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም
ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ
ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም
ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን
ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ
ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ
ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን
እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ
ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም
እየ እየን እዩ እያ እዮም
ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ
ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል
ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ
ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ
ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ
ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ
የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ
ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ
""".split() """.split()
) )

View File

@ -250,3 +250,9 @@ o.0
for orth in emoticons: for orth in emoticons:
BASE_EXCEPTIONS[orth] = [{ORTH: orth}] BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
# Moved from a suffix setting due to #9155 removing prefixes from consideration
# for lookbehinds
for u in "cfkCFK":
BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]
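With the exceptions now defined at the base level, degree abbreviations are split consistently across languages that build on BASE_EXCEPTIONS. A quick check; the output is the expected behaviour, not verified here:

import spacy

nlp = spacy.blank("en")
print([t.text for t in nlp("°C.")])  # expected: ["°", "C", "."]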

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
@ -23,13 +23,25 @@ class Ukrainian(Language):
@Ukrainian.factory( @Ukrainian.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, default_config={
"model": None,
"mode": "pymorphy2",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return UkrainianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Ukrainian"] __all__ = ["Ukrainian"]

View File

@ -1,8 +1,9 @@
from typing import Optional from typing import Optional, Callable
from thinc.api import Model from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab from ...vocab import Vocab
@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
*, *,
mode: str = "pymorphy2", mode: str = "pymorphy2",
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None: ) -> None:
if mode == "pymorphy2": if mode == "pymorphy2":
try: try:
@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None ) from None
if getattr(self, "_morph", None) is None: if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk") self._morph = MorphAnalyzer(lang="uk")
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

View File

@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...tokens import Doc from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from ... import util from ... import util
@ -24,14 +25,14 @@ use_pyvi = true
@registry.tokenizers("spacy.vi.VietnameseTokenizer") @registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True): def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp): def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
return vietnamese_tokenizer_factory return vietnamese_tokenizer_factory
class VietnameseTokenizer(DummyTokenizer): class VietnameseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, use_pyvi: bool = False): def __init__(self, vocab: Vocab, use_pyvi: bool = False):
self.vocab = nlp.vocab self.vocab = vocab
self.use_pyvi = use_pyvi self.use_pyvi = use_pyvi
if self.use_pyvi: if self.use_pyvi:
try: try:
@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer):
) )
raise ImportError(msg) from None raise ImportError(msg) from None
def __reduce__(self):
return VietnameseTokenizer, (self.vocab, self.use_pyvi)
def __call__(self, text: str) -> Doc: def __call__(self, text: str) -> Doc:
if self.use_pyvi: if self.use_pyvi:
words = self.pyvi_tokenize(text) words = self.pyvi_tokenize(text)

18
spacy/lang/vi/examples.py Normal file
View File

@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.vi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Đây là đâu, tôi là ai?",
"Căn phòng có nhiều cửa sổ nên nó khá sáng",
"Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.",
"Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.",
"Ông bạn đang ở đâu thế?",
"Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?",
"Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?",
"Làm việc nhiều chán quá, đi chơi đâu đi?",
]
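The new example sentences can be used as the docstring suggests; a short sketch, assuming pyvi is installed for the default use_pyvi = true setting:

import spacy
from spacy.lang.vi.examples import sentences

nlp = spacy.blank("vi")
for doc in nlp.pipe(sentences):
    print([t.text for t in doc][:6])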

View File

@ -9,11 +9,14 @@ _num_words = [
"bốn", "bốn",
"năm", "năm",
"sáu", "sáu",
"bảy",
"bẩy", "bẩy",
"tám", "tám",
"chín", "chín",
"mười", "mười",
"chục",
"trăm", "trăm",
"nghìn",
"tỷ", "tỷ",
] ]

View File

@ -11,6 +11,7 @@ from ...scorer import Scorer
from ...tokens import Doc from ...tokens import Doc
from ...training import validate_examples, Example from ...training import validate_examples, Example
from ...util import DummyTokenizer, registry, load_config_from_str from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ... import util from ... import util
@ -48,14 +49,14 @@ class Segmenter(str, Enum):
@registry.tokenizers("spacy.zh.ChineseTokenizer") @registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
def chinese_tokenizer_factory(nlp): def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(nlp, segmenter=segmenter) return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
return chinese_tokenizer_factory return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer): class ChineseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char): def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char):
self.vocab = nlp.vocab self.vocab = vocab
self.segmenter = ( self.segmenter = (
segmenter.value if isinstance(segmenter, Segmenter) else segmenter segmenter.value if isinstance(segmenter, Segmenter) else segmenter
) )

View File

@ -115,7 +115,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp` Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline. object and processing pipeline.
lang (str): Two-letter language ID, i.e. ISO code. lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language DOCS: https://spacy.io/api/language
""" """
@ -228,6 +228,7 @@ class Language:
"vectors": len(self.vocab.vectors), "vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name, "name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode,
} }
self._meta["labels"] = dict(self.pipe_labels) self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but # TODO: Adding this back to prevent breaking people's code etc., but
@ -978,7 +979,7 @@ class Language:
def __call__( def __call__(
self, self,
text: str, text: Union[str, Doc],
*, *,
disable: Iterable[str] = SimpleFrozenList(), disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
@ -987,7 +988,9 @@ class Language:
and can contain arbitrary whitespace. Alignment into the original string and can contain arbitrary whitespace. Alignment into the original string
is preserved. is preserved.
text (str): The text to be processed. text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
the doc will be passed directly to the pipeline, skipping
`Language.make_doc`.
disable (List[str]): Names of the pipeline components to disable. disable (List[str]): Names of the pipeline components to disable.
component_cfg (Dict[str, dict]): An optional dictionary with extra component_cfg (Dict[str, dict]): An optional dictionary with extra
keyword arguments for specific components. keyword arguments for specific components.
@ -995,7 +998,7 @@ class Language:
DOCS: https://spacy.io/api/language#call DOCS: https://spacy.io/api/language#call
""" """
doc = self.make_doc(text) doc = self._ensure_doc(text)
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
for name, proc in self.pipeline: for name, proc in self.pipeline:
@ -1080,6 +1083,20 @@ class Language:
) )
return self.tokenizer(text) return self.tokenizer(text)
def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc:
"""Create a Doc if need be, or raise an error if the input is not a Doc or a string."""
if isinstance(doc_like, Doc):
return doc_like
if isinstance(doc_like, str):
return self.make_doc(doc_like)
raise ValueError(Errors.E866.format(type=type(doc_like)))
def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc:
"""Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string."""
doc = self._ensure_doc(doc_like)
doc._context = context
return doc
def update( def update(
self, self,
examples: Iterable[Example], examples: Iterable[Example],
@ -1450,7 +1467,7 @@ class Language:
@overload @overload
def pipe( def pipe(
self, self,
texts: Iterable[str], texts: Iterable[Union[str, Doc]],
*, *,
as_tuples: Literal[False] = ..., as_tuples: Literal[False] = ...,
batch_size: Optional[int] = ..., batch_size: Optional[int] = ...,
@ -1463,7 +1480,7 @@ class Language:
@overload @overload
def pipe( # noqa: F811 def pipe( # noqa: F811
self, self,
texts: Iterable[Tuple[str, _AnyContext]], texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
*, *,
as_tuples: Literal[True] = ..., as_tuples: Literal[True] = ...,
batch_size: Optional[int] = ..., batch_size: Optional[int] = ...,
@ -1475,7 +1492,9 @@ class Language:
def pipe( # noqa: F811 def pipe( # noqa: F811
self, self,
texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]], texts: Union[
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
],
*, *,
as_tuples: bool = False, as_tuples: bool = False,
batch_size: Optional[int] = None, batch_size: Optional[int] = None,
@ -1485,7 +1504,8 @@ class Language:
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]: ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
"""Process texts as a stream, and yield `Doc` objects in order. """Process texts as a stream, and yield `Doc` objects in order.
texts (Iterable[str]): A sequence of texts to process. texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
process.
as_tuples (bool): If set to True, inputs should be a sequence of as_tuples (bool): If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of (text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False. (doc, context) tuples. Defaults to False.
@ -1500,23 +1520,24 @@ class Language:
""" """
# Handle texts with context as tuples # Handle texts with context as tuples
if as_tuples: if as_tuples:
texts = cast(Iterable[Tuple[str, _AnyContext]], texts) texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
text_context1, text_context2 = itertools.tee(texts) docs_with_contexts = (
texts = (tc[0] for tc in text_context1) self._ensure_doc_with_context(text, context) for text, context in texts
contexts = (tc[1] for tc in text_context2) )
docs = self.pipe( docs = self.pipe(
texts, docs_with_contexts,
batch_size=batch_size, batch_size=batch_size,
disable=disable, disable=disable,
n_process=n_process, n_process=n_process,
component_cfg=component_cfg, component_cfg=component_cfg,
) )
for doc, context in zip(docs, contexts): for doc in docs:
context = doc._context
doc._context = None
yield (doc, context) yield (doc, context)
return return
# At this point, we know that we're dealing with an iterable of plain texts texts = cast(Iterable[Union[str, Doc]], texts)
texts = cast(Iterable[str], texts)
# Set argument defaults # Set argument defaults
if n_process == -1: if n_process == -1:
@ -1551,7 +1572,7 @@ class Language:
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size) docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
else: else:
# if n_process == 1, no processes are forked. # if n_process == 1, no processes are forked.
docs = (self.make_doc(text) for text in texts) docs = (self._ensure_doc(text) for text in texts)
for pipe in pipes: for pipe in pipes:
docs = pipe(docs) docs = pipe(docs)
for doc in docs: for doc in docs:
@ -1570,7 +1591,7 @@ class Language:
def _multiprocessing_pipe( def _multiprocessing_pipe(
self, self,
texts: Iterable[str], texts: Iterable[Union[str, Doc]],
pipes: Iterable[Callable[..., Iterator[Doc]]], pipes: Iterable[Callable[..., Iterator[Doc]]],
n_process: int, n_process: int,
batch_size: int, batch_size: int,
@ -1596,7 +1617,7 @@ class Language:
procs = [ procs = [
mp.Process( mp.Process(
target=_apply_pipes, target=_apply_pipes,
args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()),
) )
for rch, sch in zip(texts_q, bytedocs_send_ch) for rch, sch in zip(texts_q, bytedocs_send_ch)
] ]
@ -1609,11 +1630,12 @@ class Language:
recv.recv() for recv in cycle(bytedocs_recv_ch) recv.recv() for recv in cycle(bytedocs_recv_ch)
) )
try: try:
for i, (_, (byte_doc, byte_error)) in enumerate( for i, (_, (byte_doc, byte_context, byte_error)) in enumerate(
zip(raw_texts, byte_tuples), 1 zip(raw_texts, byte_tuples), 1
): ):
if byte_doc is not None: if byte_doc is not None:
doc = Doc(self.vocab).from_bytes(byte_doc) doc = Doc(self.vocab).from_bytes(byte_doc)
doc._context = byte_context
yield doc yield doc
elif byte_error is not None: elif byte_error is not None:
error = srsly.msgpack_loads(byte_error) error = srsly.msgpack_loads(byte_error)
@ -2138,7 +2160,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]:
def _apply_pipes( def _apply_pipes(
make_doc: Callable[[str], Doc], ensure_doc: Callable[[Union[str, Doc]], Doc],
pipes: Iterable[Callable[..., Iterator[Doc]]], pipes: Iterable[Callable[..., Iterator[Doc]]],
receiver, receiver,
sender, sender,
@ -2146,7 +2168,8 @@ def _apply_pipes(
) -> None: ) -> None:
"""Worker for Language.pipe """Worker for Language.pipe
make_doc (Callable[[str,] Doc]): Function to create Doc from text. ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
or raise an error if the input is neither a Doc nor a string.
pipes (Iterable[Pipe]): The components to apply. pipes (Iterable[Pipe]): The components to apply.
receiver (multiprocessing.Connection): Pipe to receive text. Usually receiver (multiprocessing.Connection): Pipe to receive text. Usually
created by `multiprocessing.Pipe()` created by `multiprocessing.Pipe()`
@ -2159,16 +2182,16 @@ def _apply_pipes(
while True: while True:
try: try:
texts = receiver.get() texts = receiver.get()
docs = (make_doc(text) for text in texts) docs = (ensure_doc(text) for text in texts)
for pipe in pipes: for pipe in pipes:
docs = pipe(docs) # type: ignore[arg-type, assignment] docs = pipe(docs) # type: ignore[arg-type, assignment]
# Connection does not accept unpickable objects, so send list. # Connection does not accept unpickable objects, so send list.
byte_docs = [(doc.to_bytes(), None) for doc in docs] byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
padding = [(None, None)] * (len(texts) - len(byte_docs)) padding = [(None, None, None)] * (len(texts) - len(byte_docs))
sender.send(byte_docs + padding) # type: ignore[operator] sender.send(byte_docs + padding) # type: ignore[operator]
except Exception: except Exception:
error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
padding = [(None, None)] * (len(texts) - 1) padding = [(None, None, None)] * (len(texts) - 1)
sender.send(error_msg + padding) sender.send(error_msg + padding)
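Taken together, these changes let __call__ and pipe accept pre-constructed Doc objects, and they move as_tuples contexts onto Doc._context so the contexts survive multiprocessing. A minimal sketch of the new behaviour:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

# a pre-tokenized Doc is passed straight to the pipeline, skipping make_doc
doc = nlp(Doc(nlp.vocab, words=["This", "is", "pre-tokenized", "."]))

# (text, context) tuples may now also be (Doc, context) tuples
pairs = [("First text.", {"id": 1}),
         (Doc(nlp.vocab, words=["Second", "text", "."]), {"id": 2})]
for doc, context in nlp.pipe(pairs, as_tuples=True):
    print(context["id"], [t.text for t in doc])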

View File

@ -284,7 +284,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lower] return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x) self.c.lower = self.vocab.strings.add(x)
property norm_: property norm_:
@ -294,7 +294,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.norm] return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): def __set__(self, str x):
self.norm = self.vocab.strings.add(x) self.norm = self.vocab.strings.add(x)
property shape_: property shape_:
@ -304,7 +304,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.shape] return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x) self.c.shape = self.vocab.strings.add(x)
property prefix_: property prefix_:
@ -314,7 +314,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.prefix] return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x) self.c.prefix = self.vocab.strings.add(x)
property suffix_: property suffix_:
@ -324,7 +324,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.suffix] return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x) self.c.suffix = self.vocab.strings.add(x)
property lang_: property lang_:
@ -332,7 +332,7 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lang] return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x) self.c.lang = self.vocab.strings.add(x)
property flags: property flags:

View File

@ -148,9 +148,9 @@ cdef class DependencyMatcher:
Creates a token key to be used by the matcher Creates a token key to be used by the matcher
""" """
return self._normalize_key( return self._normalize_key(
unicode(key) + DELIMITER + str(key) + DELIMITER +
unicode(pattern_idx) + DELIMITER + str(pattern_idx) + DELIMITER +
unicode(token_idx) str(token_idx)
) )
def add(self, key, patterns, *, on_match=None): def add(self, key, patterns, *, on_match=None):
@ -424,7 +424,7 @@ cdef class DependencyMatcher:
return [doc[child.i] for child in doc[node].head.children if child.i < node] return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _normalize_key(self, key): def _normalize_key(self, key):
if isinstance(key, basestring): if isinstance(key, str):
return self.vocab.strings.add(key) return self.vocab.strings.add(key)
else: else:
return key return key

View File

@ -312,7 +312,7 @@ cdef class Matcher:
return final_results return final_results
def _normalize_key(self, key): def _normalize_key(self, key):
if isinstance(key, basestring): if isinstance(key, str):
return self.vocab.strings.add(key) return self.vocab.strings.add(key)
else: else:
return key return key
@ -360,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
for i, token in enumerate(doclike): for i, token in enumerate(doclike):
for name, index in extensions.items(): for name, index in extensions.items():
value = token._.get(name) value = token._.get(name)
if isinstance(value, basestring): if isinstance(value, str):
value = token.vocab.strings[value] value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value extra_attr_values[i * nr_extra_attr + index] = value
# Main loop # Main loop
@ -786,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _get_attr_values(spec, string_store): def _get_attr_values(spec, string_store):
attr_values = [] attr_values = []
for attr, value in spec.items(): for attr, value in spec.items():
if isinstance(attr, basestring): if isinstance(attr, str):
attr = attr.upper() attr = attr.upper()
if attr == '_': if attr == '_':
continue continue
@ -797,7 +797,7 @@ def _get_attr_values(spec, string_store):
if attr == "IS_SENT_START": if attr == "IS_SENT_START":
attr = "SENT_START" attr = "SENT_START"
attr = IDS.get(attr) attr = IDS.get(attr)
if isinstance(value, basestring): if isinstance(value, str):
value = string_store.add(value) value = string_store.add(value)
elif isinstance(value, bool): elif isinstance(value, bool):
value = int(value) value = int(value)
@ -938,7 +938,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
seen_predicates = {pred.key: pred.i for pred in extra_predicates} seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = [] output = []
for attr, value in spec.items(): for attr, value in spec.items():
if isinstance(attr, basestring): if isinstance(attr, str):
if attr == "_": if attr == "_":
output.extend( output.extend(
_get_extension_extra_predicates( _get_extension_extra_predicates(
@ -995,7 +995,7 @@ def _get_operators(spec):
"?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)} "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)}
# Fix casing # Fix casing
spec = {key.upper(): values for key, values in spec.items() spec = {key.upper(): values for key, values in spec.items()
if isinstance(key, basestring)} if isinstance(key, str)}
if "OP" not in spec: if "OP" not in spec:
return (ONE,) return (ONE,)
elif spec["OP"] in lookup: elif spec["OP"] in lookup:
@ -1013,7 +1013,7 @@ def _get_extensions(spec, string_store, name2index):
if isinstance(value, dict): if isinstance(value, dict):
# Handle predicates (e.g. "IN", in the extra_predicates, not here. # Handle predicates (e.g. "IN", in the extra_predicates, not here.
continue continue
if isinstance(value, basestring): if isinstance(value, str):
value = string_store.add(value) value = string_store.add(value)
if name not in name2index: if name not in name2index:
name2index[name] = len(name2index) name2index[name] = len(name2index)

View File

@ -1,11 +1,13 @@
from typing import List, Tuple, Callable, Optional, cast from typing import List, Tuple, Callable, Optional, Sequence, cast
from thinc.initializers import glorot_uniform_init from thinc.initializers import glorot_uniform_init
from thinc.util import partial from thinc.util import partial
from thinc.types import Ragged, Floats2d, Floats1d from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
from thinc.api import Model, Ops, registry from thinc.api import Model, Ops, registry
from ..tokens import Doc from ..tokens import Doc
from ..errors import Errors from ..errors import Errors
from ..vectors import Mode
from ..vocab import Vocab
@registry.layers("spacy.StaticVectors.v2") @registry.layers("spacy.StaticVectors.v2")
@ -34,20 +36,32 @@ def StaticVectors(
def forward( def forward(
model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool
) -> Tuple[Ragged, Callable]: ) -> Tuple[Ragged, Callable]:
if not sum(len(doc) for doc in docs): token_count = sum(len(doc) for doc in docs)
if not token_count:
return _handle_empty(model.ops, model.get_dim("nO")) return _handle_empty(model.ops, model.get_dim("nO"))
key_attr = model.attrs["key_attr"] key_attr: int = model.attrs["key_attr"]
W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) keys: Ints1d = model.ops.flatten(
V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data)) cast(Sequence, [doc.to_array(key_attr) for doc in docs])
rows = model.ops.flatten(
[doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
) )
vocab: Vocab = docs[0].vocab
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default:
V = cast(Floats2d, model.ops.asarray(vocab.vectors.data))
rows = vocab.vectors.find(keys=keys)
V = model.ops.as_contig(V[rows])
elif vocab.vectors.mode == Mode.floret:
V = cast(Floats2d, vocab.vectors.get_batch(keys))
V = model.ops.as_contig(V)
else:
raise RuntimeError(Errors.E896)
try: try:
vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True) vectors_data = model.ops.gemm(V, W, trans2=True)
except ValueError: except ValueError:
raise RuntimeError(Errors.E896) raise RuntimeError(Errors.E896)
# Convert negative indices to 0-vectors (TODO: more options for UNK tokens) if vocab.vectors.mode == Mode.default:
vectors_data[rows < 0] = 0 # Convert negative indices to 0-vectors
# TODO: more options for UNK tokens
vectors_data[rows < 0] = 0
output = Ragged( output = Ragged(
vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore
) )
@ -63,7 +77,7 @@ def forward(
model.inc_grad( model.inc_grad(
"W", "W",
model.ops.gemm( model.ops.gemm(
cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True
), ),
) )
return [] return []
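The StaticVectors forward pass above now branches on the vectors mode: default vectors are looked up by row with Vectors.find, while floret vectors are built per key with get_batch, so there are no out-of-vocabulary rows to zero out. A rough numpy sketch of that branching, assuming the attribute names shown in the diff (the lookup_static_vectors helper itself is hypothetical):

    import numpy
    from spacy.vectors import Mode

    def lookup_static_vectors(vocab, keys):
        if vocab.vectors.mode == Mode.default:
            rows = vocab.vectors.find(keys=keys)           # -1 for unknown keys
            V = numpy.ascontiguousarray(vocab.vectors.data[rows])
            V[rows < 0] = 0                                # zero vectors for unknowns
        elif vocab.vectors.mode == Mode.floret:
            V = numpy.ascontiguousarray(vocab.vectors.get_batch(keys))
        else:
            raise RuntimeError("unsupported vectors mode")
        return V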
View File
@ -17,7 +17,7 @@ from ...errors import Errors
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') cdef attr_t SUBTOK_LABEL = hash_string('subtok')
DEF NON_MONOTONIC = True DEF NON_MONOTONIC = True
View File
@ -5,15 +5,15 @@ from pathlib import Path
from .pipe import Pipe from .pipe import Pipe
from ..errors import Errors from ..errors import Errors
from ..training import validate_examples, Example from ..training import Example
from ..language import Language from ..language import Language
from ..matcher import Matcher from ..matcher import Matcher
from ..scorer import Scorer from ..scorer import Scorer
from ..symbols import IDS, TAG, POS, MORPH, LEMMA from ..symbols import IDS
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab from ..vocab import Vocab
from ..util import SimpleFrozenList from ..util import SimpleFrozenList, registry
from .. import util from .. import util
@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@Language.factory("attribute_ruler", default_config={"validate": False}) @Language.factory(
def make_attribute_ruler(nlp: Language, name: str, validate: bool): "attribute_ruler",
return AttributeRuler(nlp.vocab, name, validate=validate) default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)
def make_attribute_ruler(
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(
Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
@registry.scorers("spacy.attribute_ruler_scorer.v1")
def make_attribute_ruler_scorer():
return attribute_ruler_score
class AttributeRuler(Pipe): class AttributeRuler(Pipe):
@ -36,7 +68,12 @@ class AttributeRuler(Pipe):
""" """
def __init__( def __init__(
self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False self,
vocab: Vocab,
name: str = "attribute_ruler",
*,
validate: bool = False,
scorer: Optional[Callable] = attribute_ruler_score,
) -> None: ) -> None:
"""Create the AttributeRuler. After creation, you can add patterns """Create the AttributeRuler. After creation, you can add patterns
with the `.initialize()` or `.add_patterns()` methods, or load patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns
@ -45,6 +82,10 @@ class AttributeRuler(Pipe):
vocab (Vocab): The vocab. vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler". name (str): The pipe name. Defaults to "attribute_ruler".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
"lemma" and Scorer.score_token_attr_per_feat for the attribute
"morph".
RETURNS (AttributeRuler): The AttributeRuler component. RETURNS (AttributeRuler): The AttributeRuler component.
@ -57,6 +98,7 @@ class AttributeRuler(Pipe):
self.attrs: List[Dict] = [] self.attrs: List[Dict] = []
self._attrs_unnormed: List[Dict] = [] # store for reference self._attrs_unnormed: List[Dict] = [] # store for reference
self.indices: List[int] = [] self.indices: List[int] = []
self.scorer = scorer
def clear(self) -> None: def clear(self) -> None:
"""Reset all patterns.""" """Reset all patterns."""
@ -228,45 +270,6 @@ class AttributeRuler(Pipe):
all_patterns.append(p) all_patterns.append(p)
return all_patterns # type: ignore[return-value] return all_patterns # type: ignore[return-value]
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag", "pos", "morph"
and "lemma" for the target token attributes.
DOCS: https://spacy.io/api/tagger#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "AttributeRuler.score")
results = {}
attrs = set() # type: ignore
for token_attrs in self.attrs:
attrs.update(token_attrs)
for attr in attrs:
if attr == TAG:
results.update(Scorer.score_token_attr(examples, "tag", **kwargs))
elif attr == POS:
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
elif attr == MORPH:
results.update(
Scorer.score_token_attr(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
results.update(
Scorer.score_token_attr_per_feat(
examples, "morph", getter=morph_key_getter, **kwargs
)
)
elif attr == LEMMA:
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring. """Serialize the AttributeRuler to a bytestring.
View File
@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from collections import defaultdict from collections import defaultdict
from typing import Optional, Iterable from typing import Optional, Iterable, Callable
from thinc.api import Model, Config from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem
@ -12,7 +12,7 @@ from ..language import Language
from ._parser_internals import nonproj from ._parser_internals import nonproj
from ._parser_internals.nonproj import DELIMITER from ._parser_internals.nonproj import DELIMITER
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples from ..util import registry
default_model_config = """ default_model_config = """
@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"learn_tokens": False, "learn_tokens": False,
"min_action_freq": 30, "min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL, "model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"dep_uas": 0.5, "dep_uas": 0.5,
@ -63,7 +64,8 @@ def make_parser(
moves: Optional[TransitionSystem], moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int, update_with_oracle_cut_size: int,
learn_tokens: bool, learn_tokens: bool,
min_action_freq: int min_action_freq: int,
scorer: Optional[Callable],
): ):
"""Create a transition-based DependencyParser component. The dependency parser """Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can jointly learns sentence segmentation and labelled dependency parsing, and can
@ -100,6 +102,7 @@ def make_parser(
primarily affects the label accuracy, it can also affect the attachment primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity structure, as the labels are used to represent the pseudo-projectivity
transformation. transformation.
scorer (Optional[Callable]): The scoring method.
""" """
return DependencyParser( return DependencyParser(
nlp.vocab, nlp.vocab,
@ -115,7 +118,8 @@ def make_parser(
beam_update_prob=0.0, beam_update_prob=0.0,
# At some point in the future we can try to implement support for # At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective. # partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None incorrect_spans_key=None,
scorer=scorer,
) )
@Language.factory( @Language.factory(
@ -130,6 +134,7 @@ def make_parser(
"learn_tokens": False, "learn_tokens": False,
"min_action_freq": 30, "min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL, "model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"dep_uas": 0.5, "dep_uas": 0.5,
@ -151,6 +156,7 @@ def make_beam_parser(
beam_width: int, beam_width: int,
beam_density: float, beam_density: float,
beam_update_prob: float, beam_update_prob: float,
scorer: Optional[Callable],
): ):
"""Create a transition-based DependencyParser component that uses beam-search. """Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled The dependency parser jointly learns sentence segmentation and labelled
@ -207,10 +213,41 @@ def make_beam_parser(
min_action_freq=min_action_freq, min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for # At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective. # partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None incorrect_spans_key=None,
scorer=scorer,
) )
def parser_score(examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
@registry.scorers("spacy.parser_scorer.v1")
def make_parser_scorer():
return parser_score
cdef class DependencyParser(Parser): cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing. """Pipeline component for dependency parsing.
@ -233,6 +270,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None, incorrect_spans_key=None,
scorer=parser_score,
): ):
"""Create a DependencyParser. """Create a DependencyParser.
""" """
@ -249,6 +287,7 @@ cdef class DependencyParser(Parser):
beam_update_prob=beam_update_prob, beam_update_prob=beam_update_prob,
multitasks=multitasks, multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key, incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
) )
@property @property
@ -281,31 +320,6 @@ cdef class DependencyParser(Parser):
labels.add(label) labels.add(label)
return tuple(sorted(labels)) return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
def scored_parses(self, beams): def scored_parses(self, beams):
"""Return two dictionaries with scores for each beam/doc that was processed: """Return two dictionaries with scores for each beam/doc that was processed:
one containing (i, head) keys, and another containing (i, label) keys. one containing (i, head) keys, and another containing (i, label) keys.
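parser_score above combines sentence-boundary scoring with labelled dependency scoring; its dep_getter resolves each dependency hash to a lower-cased string before comparison. A small standalone sketch of the dependency part, assuming examples is a list of spacy.training.Example objects with gold parses:

    from spacy.scorer import Scorer

    def dep_getter(token, attr):
        # Resolve the hash to its string label and lower-case it, as in parser_score.
        return token.vocab.strings.as_string(getattr(token, attr)).lower()

    scores = Scorer.score_deps(
        examples, "dep", getter=dep_getter, ignore_labels=("p", "punct")
    )
    print(scores["dep_uas"], scores["dep_las"])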
View File
@ -17,10 +17,12 @@ from ..language import Language
from ..vocab import Vocab from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import SimpleFrozenList from ..util import SimpleFrozenList, registry
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
# See #9050
BACKWARD_OVERWRITE = True
default_model_config = """ default_model_config = """
[model] [model]
@ -51,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True, "incl_context": True,
"entity_vector_length": 64, "entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"nel_micro_f": 1.0, "nel_micro_f": 1.0,
@ -69,6 +73,8 @@ def make_entity_linker(
incl_context: bool, incl_context: bool,
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool,
scorer: Optional[Callable],
): ):
"""Construct an EntityLinker component. """Construct an EntityLinker component.
@ -82,6 +88,7 @@ def make_entity_linker(
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
@ -93,9 +100,20 @@ def make_entity_linker(
incl_context=incl_context, incl_context=incl_context,
entity_vector_length=entity_vector_length, entity_vector_length=entity_vector_length,
get_candidates=get_candidates, get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
) )
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer():
return entity_linker_score
class EntityLinker(TrainablePipe): class EntityLinker(TrainablePipe):
"""Pipeline component for named entity linking. """Pipeline component for named entity linking.
@ -116,6 +134,8 @@ class EntityLinker(TrainablePipe):
incl_context: bool, incl_context: bool,
entity_vector_length: int, entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
) -> None: ) -> None:
"""Initialize an entity linker. """Initialize an entity linker.
@ -130,6 +150,8 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
@ -141,11 +163,12 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior self.incl_prior = incl_prior
self.incl_context = incl_context self.incl_context = incl_context
self.get_candidates = get_candidates self.get_candidates = get_candidates
self.cfg: Dict[str, Any] = {} self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False) self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account # how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab) self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
@ -384,23 +407,14 @@ class EntityLinker(TrainablePipe):
if count_ents != len(kb_ids): if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0 i = 0
overwrite = self.cfg["overwrite"]
for doc in docs: for doc in docs:
for ent in doc.ents: for ent in doc.ents:
kb_id = kb_ids[i] kb_id = kb_ids[i]
i += 1 i += 1
for token in ent: for token in ent:
token.ent_kb_id_ = kb_id if token.ent_kb_id == 0 or overwrite:
token.ent_kb_id_ = kb_id
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS TODO: https://spacy.io/api/entity_linker#score
"""
validate_examples(examples, "EntityLinker.score")
return Scorer.score_links(examples, negative_labels=[self.NIL])
def to_bytes(self, *, exclude=tuple()): def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring. """Serialize the pipe to a bytestring.
View File
@ -9,11 +9,10 @@ from .pipe import Pipe
from ..training import Example from ..training import Example
from ..language import Language from ..language import Language
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from ..scorer import get_ner_prf from ..scorer import get_ner_prf
from ..training import validate_examples
DEFAULT_ENT_ID_SEP = "||" DEFAULT_ENT_ID_SEP = "||"
@ -28,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"validate": False, "validate": False,
"overwrite_ents": False, "overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP, "ent_id_sep": DEFAULT_ENT_ID_SEP,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
}, },
default_score_weights={ default_score_weights={
"ents_f": 1.0, "ents_f": 1.0,
@ -43,6 +43,7 @@ def make_entity_ruler(
validate: bool, validate: bool,
overwrite_ents: bool, overwrite_ents: bool,
ent_id_sep: str, ent_id_sep: str,
scorer: Optional[Callable],
): ):
return EntityRuler( return EntityRuler(
nlp, nlp,
@ -51,9 +52,19 @@ def make_entity_ruler(
validate=validate, validate=validate,
overwrite_ents=overwrite_ents, overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep, ent_id_sep=ent_id_sep,
scorer=scorer,
) )
def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer():
return entity_ruler_score
class EntityRuler(Pipe): class EntityRuler(Pipe):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based """The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical rules or exact phrase matches. It can be combined with the statistical
@ -75,6 +86,7 @@ class EntityRuler(Pipe):
overwrite_ents: bool = False, overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP, ent_id_sep: str = DEFAULT_ENT_ID_SEP,
patterns: Optional[List[PatternType]] = None, patterns: Optional[List[PatternType]] = None,
scorer: Optional[Callable] = entity_ruler_score,
) -> None: ) -> None:
"""Initialize the entity ruler. If patterns are supplied here, they """Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"` need to be a list of dictionaries with a `"label"` and `"pattern"`
@ -95,6 +107,8 @@ class EntityRuler(Pipe):
overwrite_ents (bool): If existing entities are present, e.g. entities overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary. added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs. ent_id_sep (str): Separator used internally for entity IDs.
scorer (Optional[Callable]): The scoring method. Defaults to
spacy.scorer.get_ner_prf.
DOCS: https://spacy.io/api/entityruler#init DOCS: https://spacy.io/api/entityruler#init
""" """
@ -113,6 +127,7 @@ class EntityRuler(Pipe):
self._ent_ids = defaultdict(tuple) # type: ignore self._ent_ids = defaultdict(tuple) # type: ignore
if patterns is not None: if patterns is not None:
self.add_patterns(patterns) self.add_patterns(patterns)
self.scorer = scorer
def __len__(self) -> int: def __len__(self) -> int:
"""The number of all patterns added to the entity ruler.""" """The number of all patterns added to the entity ruler."""
@ -363,10 +378,6 @@ class EntityRuler(Pipe):
label = f"{label}{self.ent_id_sep}{ent_id}" label = f"{label}{self.ent_id_sep}{ent_id}"
return label return label
def score(self, examples, **kwargs):
validate_examples(examples, "EntityRuler.score")
return get_ner_prf(examples)
def from_bytes( def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler": ) -> "EntityRuler":
View File
@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc, Token from ..tokens import Doc, Token
from ..vocab import Vocab from ..vocab import Vocab
from ..training import validate_examples from ..util import logger, SimpleFrozenList, registry
from ..util import logger, SimpleFrozenList
from .. import util from .. import util
@Language.factory( @Language.factory(
"lemmatizer", "lemmatizer",
assigns=["token.lemma"], assigns=["token.lemma"],
default_config={"model": None, "mode": "lookup", "overwrite": False}, default_config={
"model": None,
"mode": "lookup",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_token_attr(examples, "lemma", **kwargs)
@registry.scorers("spacy.lemmatizer_scorer.v1")
def make_lemmatizer_scorer():
return lemmatizer_score
class Lemmatizer(Pipe): class Lemmatizer(Pipe):
@ -60,6 +80,7 @@ class Lemmatizer(Pipe):
*, *,
mode: str = "lookup", mode: str = "lookup",
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None: ) -> None:
"""Initialize a Lemmatizer. """Initialize a Lemmatizer.
@ -69,6 +90,8 @@ class Lemmatizer(Pipe):
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
overwrite (bool): Whether to overwrite existing lemmas. Defaults to overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`. `False`.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "lemma".
DOCS: https://spacy.io/api/lemmatizer#init DOCS: https://spacy.io/api/lemmatizer#init
""" """
@ -89,6 +112,7 @@ class Lemmatizer(Pipe):
raise ValueError(Errors.E1003.format(mode=mode)) raise ValueError(Errors.E1003.format(mode=mode))
self.lemmatize = getattr(self, mode_attr) self.lemmatize = getattr(self, mode_attr)
self.cache = {} # type: ignore[var-annotated] self.cache = {} # type: ignore[var-annotated]
self.scorer = scorer
@property @property
def mode(self): def mode(self):
@ -247,17 +271,6 @@ class Lemmatizer(Pipe):
""" """
return False return False
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores.
DOCS: https://spacy.io/api/lemmatizer#score
"""
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
): ):
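Because the lemmatizer's scorer is now registered, it can also be resolved by name outside the config system. A short sketch, assuming examples is a list of spacy.training.Example objects with gold lemmas:

    from spacy.util import registry

    lemma_scorer = registry.scorers.get("spacy.lemmatizer_scorer.v1")()
    print(lemma_scorer(examples)["lemma_acc"])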
View File
@ -1,5 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional, Union, Dict from typing import Optional, Union, Dict, Callable
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice from itertools import islice
@ -17,7 +17,11 @@ from .tagger import Tagger
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
# See #9050
BACKWARD_OVERWRITE = True
BACKWARD_EXTEND = False
default_model_config = """ default_model_config = """
[model] [model]
@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"morphologizer", "morphologizer",
assigns=["token.morph", "token.pos"], assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL}, default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
) )
def make_morphologizer( def make_morphologizer(
nlp: Language, nlp: Language,
model: Model, model: Model,
name: str, name: str,
overwrite: bool,
extend: bool,
scorer: Optional[Callable],
): ):
return Morphologizer(nlp.vocab, model, name) return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
def morphologizer_score(examples, **kwargs):
def morph_key_getter(token, attr):
return getattr(token, attr).key
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer():
return morphologizer_score
class Morphologizer(Tagger): class Morphologizer(Tagger):
@ -67,6 +91,10 @@ class Morphologizer(Tagger):
vocab: Vocab, vocab: Vocab,
model: Model, model: Model,
name: str = "morphologizer", name: str = "morphologizer",
*,
overwrite: bool = BACKWARD_OVERWRITE,
extend: bool = BACKWARD_EXTEND,
scorer: Optional[Callable] = morphologizer_score,
): ):
"""Initialize a morphologizer. """Initialize a morphologizer.
@ -74,6 +102,9 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#init DOCS: https://spacy.io/api/morphologizer#init
""" """
@ -85,8 +116,14 @@ class Morphologizer(Tagger):
# store mappings from morph+POS labels to token-level annotations: # store mappings from morph+POS labels to token-level annotations:
# 1) labels_morph stores a mapping from morph+POS->morph # 1) labels_morph stores a mapping from morph+POS->morph
# 2) labels_pos stores a mapping from morph+POS->POS # 2) labels_pos stores a mapping from morph+POS->POS
cfg = {"labels_morph": {}, "labels_pos": {}} cfg = {
"labels_morph": {},
"labels_pos": {},
"overwrite": overwrite,
"extend": extend,
}
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -192,14 +229,34 @@ class Morphologizer(Tagger):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
morph = self.labels[tag_id] morph = self.labels[tag_id]
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) # set morph
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) if doc.c[j].morph == 0 or overwrite or extend:
if overwrite and extend:
# morphologizer morph overwrites any existing features
# while extending
extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
elif extend:
# existing features are preserved and any new features
# are added
extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
doc.c[j].morph = self.vocab.morphology.add(extended_morph)
else:
# clobber
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
# set POS
if doc.c[j].pos == 0 or overwrite:
doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
def get_loss(self, examples, scores): def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and """Find the loss and gradient of loss for the batch of documents and
@ -246,24 +303,3 @@ class Morphologizer(Tagger):
if self.model.ops.xp.isnan(loss): if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name)) raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores return float(loss), d_scores
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#score
"""
def morph_key_getter(token, attr):
return getattr(token, attr).key
validate_examples(examples, "Morphologizer.score")
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
results.update(Scorer.score_token_attr_per_feat(examples,
"morph", getter=morph_key_getter, **kwargs))
return results
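The overwrite and extend flags added to the morphologizer above control how predicted morph features interact with features already on a token: tokens without features always receive the prediction, overwrite replaces existing features, and extend merges the two feature dictionaries (via Morphology.feats_to_dict). A plain-dict sketch of the three merge branches, with hypothetical feature values and the two config flags as free variables:

    existing = {"Number": "Sing"}                   # features already on the token
    predicted = {"Case": "Nom", "Number": "Plur"}   # features from the model

    if overwrite and extend:
        merged = {**existing, **predicted}   # predicted values win on conflicts
    elif extend:
        merged = {**predicted, **existing}   # existing values are preserved
    else:
        merged = predicted                   # plain overwrite (or token had none)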
View File
@ -1,6 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from collections import defaultdict from collections import defaultdict
from typing import Optional, Iterable from typing import Optional, Iterable, Callable
from thinc.api import Model, Config from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem
@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import get_ner_prf, PRFScore from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples from ..util import registry
default_model_config = """ default_model_config = """
@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"moves": None, "moves": None,
"update_with_oracle_cut_size": 100, "update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL, "model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None "incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
@ -52,7 +53,8 @@ def make_ner(
model: Model, model: Model,
moves: Optional[TransitionSystem], moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int, update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str]=None incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
): ):
"""Create a transition-based EntityRecognizer component. The entity recognizer """Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens. identifies non-overlapping labelled spans of tokens.
@ -80,6 +82,7 @@ def make_ner(
incorrect_spans_key (Optional[str]): Identifies spans that are known incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key. can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityRecognizer( return EntityRecognizer(
nlp.vocab, nlp.vocab,
@ -92,6 +95,7 @@ def make_ner(
beam_width=1, beam_width=1,
beam_density=0.0, beam_density=0.0,
beam_update_prob=0.0, beam_update_prob=0.0,
scorer=scorer,
) )
@Language.factory( @Language.factory(
@ -104,7 +108,8 @@ def make_ner(
"beam_density": 0.01, "beam_density": 0.01,
"beam_update_prob": 0.5, "beam_update_prob": 0.5,
"beam_width": 32, "beam_width": 32,
"incorrect_spans_key": None "incorrect_spans_key": None,
"scorer": None,
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
) )
@ -117,7 +122,8 @@ def make_beam_ner(
beam_width: int, beam_width: int,
beam_density: float, beam_density: float,
beam_update_prob: float, beam_update_prob: float,
incorrect_spans_key: Optional[str]=None incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
): ):
"""Create a transition-based EntityRecognizer component that uses beam-search. """Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens. The entity recognizer identifies non-overlapping labelled spans of tokens.
@ -153,6 +159,7 @@ def make_beam_ner(
and are faster to compute. and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities. entities known to be non-entities.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityRecognizer( return EntityRecognizer(
nlp.vocab, nlp.vocab,
@ -164,10 +171,20 @@ def make_beam_ner(
beam_width=beam_width, beam_width=beam_width,
beam_density=beam_density, beam_density=beam_density,
beam_update_prob=beam_update_prob, beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
) )
def ner_score(examples, **kwargs):
return get_ner_prf(examples, **kwargs)
@registry.scorers("spacy.ner_scorer.v1")
def make_ner_scorer():
return ner_score
cdef class EntityRecognizer(Parser): cdef class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition. """Pipeline component for named entity recognition.
@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None, incorrect_spans_key=None,
scorer=ner_score,
): ):
"""Create an EntityRecognizer. """Create an EntityRecognizer.
""" """
@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser):
beam_update_prob=beam_update_prob, beam_update_prob=beam_update_prob,
multitasks=multitasks, multitasks=multitasks,
incorrect_spans_key=incorrect_spans_key, incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
) )
def add_multitask_objective(self, mt_component): def add_multitask_objective(self, mt_component):
@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser):
if move[0] in ("B", "I", "L", "U")) if move[0] in ("B", "I", "L", "U"))
return tuple(sorted(labels)) return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The NER precision, recall and f-scores.
DOCS: https://spacy.io/api/entityrecognizer#score
"""
validate_examples(examples, "EntityRecognizer.score")
return get_ner_prf(examples)
def scored_ents(self, beams): def scored_ents(self, beams):
"""Return a dictionary of (start, end, label) tuples with corresponding scores """Return a dictionary of (start, end, label) tuples with corresponding scores
for each beam/doc that was processed. for each beam/doc that was processed.
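Both the entity recognizer and the entity ruler now route scoring through get_ner_prf, exposed as the registered scorers spacy.ner_scorer.v1 and spacy.entity_ruler_scorer.v1. A direct usage sketch, assuming examples is a list of spacy.training.Example objects with gold entities:

    from spacy.scorer import get_ner_prf

    scores = get_ner_prf(examples)
    print(scores["ents_p"], scores["ents_r"], scores["ents_f"])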
View File
@ -81,6 +81,17 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe#score DOCS: https://spacy.io/api/pipe#score
""" """
if hasattr(self, "scorer") and self.scorer is not None:
scorer_kwargs = {}
# use default settings from cfg (e.g., threshold)
if hasattr(self, "cfg") and isinstance(self.cfg, dict):
scorer_kwargs.update(self.cfg)
# override self.cfg["labels"] with self.labels
if hasattr(self, "labels"):
scorer_kwargs["labels"] = self.labels
# override with kwargs settings
scorer_kwargs.update(kwargs)
return self.scorer(examples, **scorer_kwargs)
return {} return {}
@property @property
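The new default Pipe.score above delegates to self.scorer and assembles its keyword arguments in three layers, so caller-supplied settings always win. A condensed sketch of that precedence with hypothetical values:

    cfg = {"threshold": 0.5, "labels": ["A"]}   # component.cfg
    labels = ("A", "B")                         # component.labels property
    caller_kwargs = {"threshold": 0.7}          # arguments passed to score()

    scorer_kwargs = {}
    scorer_kwargs.update(cfg)                   # {"threshold": 0.5, "labels": ["A"]}
    scorer_kwargs["labels"] = labels            # labels property overrides cfg["labels"]
    scorer_kwargs.update(caller_kwargs)         # {"threshold": 0.7, "labels": ("A", "B")}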
View File
@ -1,26 +1,32 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Optional, List from typing import Optional, List, Callable
import srsly import srsly
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from .senter import senter_score
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples
from .. import util from .. import util
# see #9050
BACKWARD_OVERWRITE = False
@Language.factory( @Language.factory(
"sentencizer", "sentencizer",
assigns=["token.is_sent_start", "doc.sents"], assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None}, default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
) )
def make_sentencizer( def make_sentencizer(
nlp: Language, nlp: Language,
name: str, name: str,
punct_chars: Optional[List[str]] punct_chars: Optional[List[str]],
overwrite: bool,
scorer: Optional[Callable],
): ):
return Sentencizer(name, punct_chars=punct_chars) return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
class Sentencizer(Pipe): class Sentencizer(Pipe):
@ -41,12 +47,20 @@ class Sentencizer(Pipe):
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
'', ''] '', '']
def __init__(self, name="sentencizer", *, punct_chars=None): def __init__(
self,
name="sentencizer",
*,
punct_chars=None,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize the sentencizer. """Initialize the sentencizer.
punct_chars (list): Punctuation characters to split on. Will be punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object. serialized with the nlp object.
RETURNS (Sentencizer): The sentencizer component. scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencizer#init DOCS: https://spacy.io/api/sentencizer#init
""" """
@ -55,6 +69,8 @@ class Sentencizer(Pipe):
self.punct_chars = set(punct_chars) self.punct_chars = set(punct_chars)
else: else:
self.punct_chars = set(self.default_punct_chars) self.punct_chars = set(self.default_punct_chars)
self.overwrite = overwrite
self.scorer = scorer
def __call__(self, doc): def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start. """Apply the sentencizer to a Doc and set Token.is_sent_start.
@ -115,29 +131,12 @@ class Sentencizer(Pipe):
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries if doc.c[j].sent_start == 0 or self.overwrite:
if doc.c[j].sent_start == 0:
if tag_id: if tag_id:
doc.c[j].sent_start = 1 doc.c[j].sent_start = 1
else: else:
doc.c[j].sent_start = -1 doc.c[j].sent_start = -1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "Sentencizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
def to_bytes(self, *, exclude=tuple()): def to_bytes(self, *, exclude=tuple()):
"""Serialize the sentencizer to a bytestring. """Serialize the sentencizer to a bytestring.
@ -145,7 +144,7 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer#to_bytes DOCS: https://spacy.io/api/sentencizer#to_bytes
""" """
return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_bytes(self, bytes_data, *, exclude=tuple()): def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the sentencizer from a bytestring. """Load the sentencizer from a bytestring.
@ -157,6 +156,7 @@ class Sentencizer(Pipe):
""" """
cfg = srsly.msgpack_loads(bytes_data) cfg = srsly.msgpack_loads(bytes_data)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self return self
def to_disk(self, path, *, exclude=tuple()): def to_disk(self, path, *, exclude=tuple()):
@ -166,7 +166,7 @@ class Sentencizer(Pipe):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
path = path.with_suffix(".json") path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
def from_disk(self, path, *, exclude=tuple()): def from_disk(self, path, *, exclude=tuple()):
@ -178,4 +178,5 @@ class Sentencizer(Pipe):
path = path.with_suffix(".json") path = path.with_suffix(".json")
cfg = srsly.read_json(path) cfg = srsly.read_json(path)
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite)
return self return self
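The sentencizer's overwrite flag is now round-tripped through to_bytes/from_bytes and to_disk/from_disk alongside punct_chars. A short sketch of the bytes round trip:

    import spacy

    nlp = spacy.blank("en")
    sentencizer = nlp.add_pipe("sentencizer", config={"overwrite": True})
    data = sentencizer.to_bytes()

    restored = spacy.blank("en").add_pipe("sentencizer")
    restored.from_bytes(data)
    assert restored.overwrite is True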
View File
@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice from itertools import islice
from typing import Optional, Callable
import srsly import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@ -11,8 +12,11 @@ from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """ default_model_config = """
[model] [model]
@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"senter", "senter",
assigns=["token.is_sent_start"], assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL}, default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
) )
def make_senter(nlp: Language, name: str, model: Model): def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name) return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def senter_score(examples, **kwargs):
def has_sents(doc):
return doc.has_annotation("SENT_START")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer():
return senter_score
class SentenceRecognizer(Tagger): class SentenceRecognizer(Tagger):
@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger):
DOCS: https://spacy.io/api/sentencerecognizer DOCS: https://spacy.io/api/sentencerecognizer
""" """
def __init__(self, vocab, model, name="senter"): def __init__(
self,
vocab,
model,
name="senter",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
):
"""Initialize a sentence recognizer. """Initialize a sentence recognizer.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencerecognizer#init DOCS: https://spacy.io/api/sentencerecognizer#init
""" """
@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger):
self.model = model self.model = model
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
self.cfg = {} self.cfg = {"overwrite": overwrite}
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber existing sentence boundaries if doc.c[j].sent_start == 0 or overwrite:
if doc.c[j].sent_start == 0:
if tag_id == 1: if tag_id == 1:
doc.c[j].sent_start = 1 doc.c[j].sent_start = 1
else: else:
@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None): def add_label(self, label, values=None):
raise NotImplementedError raise NotImplementedError
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
DOCS: https://spacy.io/api/sentencerecognizer#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "SentenceRecognizer.score")
results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
del results["sents_per_type"]
return results
View File
@ -104,6 +104,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
"max_positive": None, "max_positive": None,
"model": DEFAULT_SPANCAT_MODEL, "model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
}, },
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
) )
@ -113,8 +114,9 @@ def make_spancat(
suggester: Suggester, suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d], model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str, spans_key: str,
threshold: float = 0.5, scorer: Optional[Callable],
max_positive: Optional[int] = None, threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer": ) -> "SpanCategorizer":
"""Create a SpanCategorizer component. The span categorizer consists of two """Create a SpanCategorizer component. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller parts: a suggester function that proposes candidate spans, and a labeller
@ -144,9 +146,28 @@ def make_spancat(
threshold=threshold, threshold=threshold,
max_positive=max_positive, max_positive=max_positive,
name=name, name=name,
scorer=scorer,
) )
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs)
attr_prefix = "spans_"
key = kwargs["spans_key"]
kwargs.setdefault("attr", f"{attr_prefix}{key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer():
return spancat_score
class SpanCategorizer(TrainablePipe): class SpanCategorizer(TrainablePipe):
"""Pipeline component to label spans of text. """Pipeline component to label spans of text.
@ -163,8 +184,25 @@ class SpanCategorizer(TrainablePipe):
spans_key: str = "spans", spans_key: str = "spans",
threshold: float = 0.5, threshold: float = 0.5,
max_positive: Optional[int] = None, max_positive: Optional[int] = None,
scorer: Optional[Callable] = spancat_score,
) -> None: ) -> None:
"""Initialize the span categorizer. """Initialize the span categorizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
spans_key (str): Key of the Doc.spans dict to save the spans under.
During initialization and training, the component will look for
spans on the reference document under the same key. Defaults to
`"spans"`.
threshold (float): Minimum probability to consider a prediction
positive. Spans with a positive prediction will be saved on the Doc.
Defaults to 0.5.
max_positive (Optional[int]): Maximum number of labels to consider
positive per span. Defaults to None, indicating no limit.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
DOCS: https://spacy.io/api/spancategorizer#init DOCS: https://spacy.io/api/spancategorizer#init
""" """
@ -178,6 +216,7 @@ class SpanCategorizer(TrainablePipe):
self.suggester = suggester self.suggester = suggester
self.model = model self.model = model
self.name = name self.name = name
self.scorer = scorer
@property @property
def key(self) -> str: def key(self) -> str:
@ -379,26 +418,6 @@ class SpanCategorizer(TrainablePipe):
else: else:
self.model.initialize() self.model.initialize()
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/spancategorizer#score
"""
validate_examples(examples, "SpanCategorizer.score")
self._validate_categories(examples)
kwargs = dict(kwargs)
attr_prefix = "spans_"
kwargs.setdefault("attr", f"{attr_prefix}{self.key}")
kwargs.setdefault("allow_overlap", True)
kwargs.setdefault(
"getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
)
kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans)
return Scorer.score_spans(examples, **kwargs)
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
# TODO # TODO
pass pass
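spancat_score above receives spans_key through its kwargs (supplied from the component's cfg by the default Pipe.score) and maps the prefixed attribute name back to the Doc.spans key. A small sketch of that mapping, using a hypothetical "sc" key:

    attr_prefix = "spans_"
    spans_key = "sc"                        # value from the component's cfg
    attr = f"{attr_prefix}{spans_key}"      # "spans_sc", used in the score names

    def getter(doc, key):
        # Strip the prefix again to look up Doc.spans["sc"].
        return doc.spans.get(key[len(attr_prefix):], [])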
View File
@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import Callable, Optional
import numpy import numpy
import srsly import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@ -18,8 +19,11 @@ from ..parts_of_speech import X
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..util import registry
from .. import util from .. import util
# See #9050
BACKWARD_OVERWRITE = False
default_model_config = """ default_model_config = """
[model] [model]
@ -41,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory( @Language.factory(
"tagger", "tagger",
assigns=["token.tag"], assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL}, default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
default_score_weights={"tag_acc": 1.0}, default_score_weights={"tag_acc": 1.0},
) )
def make_tagger(nlp: Language, name: str, model: Model): def make_tagger(
nlp: Language,
name: str,
model: Model,
overwrite: bool,
scorer: Optional[Callable],
):
"""Construct a part-of-speech tagger component. """Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
@ -52,7 +62,16 @@ def make_tagger(nlp: Language, name: str, model: Model):
in size, and be normalized as probabilities (all scores between 0 and 1, in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1). with the rows summing to 1).
""" """
return Tagger(nlp.vocab, model, name) return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def tagger_score(examples, **kwargs):
return Scorer.score_token_attr(examples, "tag", **kwargs)
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
return tagger_score
class Tagger(TrainablePipe): class Tagger(TrainablePipe):
@ -60,13 +79,23 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger DOCS: https://spacy.io/api/tagger
""" """
def __init__(self, vocab, model, name="tagger"): def __init__(
self,
vocab,
model,
name="tagger",
*,
overwrite=BACKWARD_OVERWRITE,
scorer=tagger_score,
):
"""Initialize a part-of-speech tagger. """Initialize a part-of-speech tagger.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
DOCS: https://spacy.io/api/tagger#init DOCS: https://spacy.io/api/tagger#init
""" """
@ -74,8 +103,9 @@ class Tagger(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": []} cfg = {"labels": [], "overwrite": overwrite}
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
@property @property
def labels(self): def labels(self):
@ -135,13 +165,13 @@ class Tagger(TrainablePipe):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags if doc.c[j].tag == 0 or overwrite:
if doc.c[j].tag == 0:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
def update(self, examples, *, drop=0., sgd=None, losses=None): def update(self, examples, *, drop=0., sgd=None, losses=None):
@ -289,15 +319,3 @@ class Tagger(TrainablePipe):
self.cfg["labels"].append(label) self.cfg["labels"].append(label)
self.vocab.strings.add(label) self.vocab.strings.add(label)
return 1 return 1
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by
Scorer.score_token_attr for the attributes "tag".
DOCS: https://spacy.io/api/tagger#score
"""
validate_examples(examples, "Tagger.score")
return Scorer.score_token_attr(examples, "tag", **kwargs)
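With the changes above, the tagger factory takes two new settings: overwrite (whether predicted tags may replace preexisting ones) and scorer (a registered scoring callback, defaulting to spacy.tagger_scorer.v1). A short sketch of overriding them at construction time, assuming spaCy v3.2+:

    import spacy

    nlp = spacy.blank("en")
    tagger = nlp.add_pipe(
        "tagger",
        config={"overwrite": True, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
    )
    assert tagger.cfg["overwrite"] is True  # stored in cfg and consulted in set_annotations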

View File

@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
@ -70,7 +71,11 @@ subword_features = true
@Language.factory( @Language.factory(
"textcat", "textcat",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, default_config={
"threshold": 0.5,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
},
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
"cats_score_desc": None, "cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
}, },
) )
def make_textcat( def make_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -95,8 +104,23 @@ def make_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
""" """
return TextCategorizer(nlp.vocab, model, name, threshold=threshold) return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
@registry.scorers("spacy.textcat_scorer.v1")
def make_textcat_scorer():
return textcat_score
class TextCategorizer(TrainablePipe): class TextCategorizer(TrainablePipe):
@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe):
""" """
def __init__( def __init__(
self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float self,
vocab: Vocab,
model: Model,
name: str = "textcat",
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
) -> None: ) -> None:
"""Initialize a text categorizer for single-label classification. """Initialize a text categorizer for single-label classification.
@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe):
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".
DOCS: https://spacy.io/api/textcategorizer#init DOCS: https://spacy.io/api/textcategorizer#init
""" """
@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None} cfg = {"labels": [], "threshold": threshold, "positive_label": None}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer
@property @property
def labels(self) -> Tuple[str]: def labels(self) -> Tuple[str]:
@ -353,26 +386,6 @@ class TextCategorizer(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "TextCategorizer.score")
self._validate_categories(examples)
kwargs.setdefault("threshold", self.cfg["threshold"])
kwargs.setdefault("positive_label", self.cfg["positive_label"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=False,
**kwargs,
)
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations.""" """Check whether the provided examples all have single-label cats annotations."""
for ex in examples: for ex in examples:
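The textcat factory now resolves its scorer from the registry (spacy.textcat_scorer.v1 by default, wrapping Scorer.score_cats with multi_label=False) instead of a hard-coded score() method. A sketch of a pipeline that keeps the default scorer and only adjusts the threshold, assuming spaCy v3.2+ with labels chosen purely for illustration:

    import spacy

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat", config={"threshold": 0.5})
    for label in ("POSITIVE", "NEGATIVE"):  # illustrative labels
        textcat.add_label(label)
    # after training, nlp.evaluate(...) reports cats_score via the registered scorer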

View File

@ -5,10 +5,11 @@ from thinc.api import Model, Config
from thinc.types import Floats2d from thinc.types import Floats2d
from ..language import Language from ..language import Language
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc from ..tokens import Doc
from ..util import registry
from ..vocab import Vocab from ..vocab import Vocab
from .textcat import TextCategorizer from .textcat import TextCategorizer
@ -70,7 +71,11 @@ subword_features = true
@Language.factory( @Language.factory(
"textcat_multilabel", "textcat_multilabel",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
},
default_score_weights={ default_score_weights={
"cats_score": 1.0, "cats_score": 1.0,
"cats_score_desc": None, "cats_score_desc": None,
@ -86,7 +91,11 @@ subword_features = true
}, },
) )
def make_multilabel_textcat( def make_multilabel_textcat(
nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer": ) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered over a whole document. It can learn one or more labels, and the labels are considered
@ -97,7 +106,23 @@ def make_multilabel_textcat(
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
""" """
return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
)
def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_cats(
examples,
"cats",
multi_label=True,
**kwargs,
)
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
class MultiLabel_TextCategorizer(TextCategorizer): class MultiLabel_TextCategorizer(TextCategorizer):
@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name: str = "textcat_multilabel", name: str = "textcat_multilabel",
*, *,
threshold: float, threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
) -> None: ) -> None:
"""Initialize a text categorizer for multi-label classification. """Initialize a text categorizer for multi-label classification.
@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold} cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer
def initialize( # type: ignore[override] def initialize( # type: ignore[override]
self, self,
@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer):
assert len(label_sample) > 0, Errors.E923.format(name=self.name) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
DOCS: https://spacy.io/api/textcategorizer#score
"""
validate_examples(examples, "MultiLabel_TextCategorizer.score")
kwargs.setdefault("threshold", self.cfg["threshold"])
return Scorer.score_cats(
examples,
"cats",
labels=self.labels,
multi_label=True,
**kwargs,
)
def _validate_categories(self, examples: Iterable[Example]): def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations. """This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'.""" This method overwrites the more strict one from 'textcat'."""
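textcat_multilabel gets the same treatment via spacy.textcat_multilabel_scorer.v1, which calls Scorer.score_cats with multi_label=True so each label is scored independently rather than as one of a set of mutually exclusive classes. A self-contained sketch of calling the underlying scorer directly on a toy example (labels and scores are made up):

    import spacy
    from spacy.tokens import Doc
    from spacy.training import Example
    from spacy.scorer import Scorer

    nlp = spacy.blank("en")
    pred = Doc(nlp.vocab, words=["great", "game"])
    pred.cats = {"SPORTS": 0.9, "POLITICS": 0.2}
    ref = Doc(nlp.vocab, words=["great", "game"])
    ref.cats = {"SPORTS": 1.0, "POLITICS": 0.0}
    scores = Scorer.score_cats(
        [Example(pred, ref)], "cats", labels=["SPORTS", "POLITICS"], multi_label=True
    )
    print(scores["cats_macro_f"])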

View File

@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe):
cdef public Vocab vocab cdef public Vocab vocab
cdef public object model cdef public object model
cdef public object cfg cdef public object cfg
cdef public object scorer

View File

@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe):
beam_density=0.0, beam_density=0.0,
beam_update_prob=0.0, beam_update_prob=0.0,
multitasks=tuple(), multitasks=tuple(),
incorrect_spans_key=None incorrect_spans_key=None,
scorer=None,
): ):
"""Create a Parser. """Create a Parser.
@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe):
incorrect_spans_key (Optional[str]): Identifies spans that are known incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key. can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method. Defaults to None.
""" """
self.vocab = vocab self.vocab = vocab
self.name = name self.name = name
@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe):
self.add_multitask_objective(multitask) self.add_multitask_objective(multitask)
self._rehearsal_model = None self._rehearsal_model = None
self.scorer = scorer
def __getnewargs_ex__(self): def __getnewargs_ex__(self):
"""This allows pickling the Parser and its keyword-only init arguments""" """This allows pickling the Parser and its keyword-only init arguments"""

View File

@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel):
# fmt: off # fmt: off
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
dropout: StrictFloat = Field(..., title="Dropout rate") dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch")
n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch")
optimizer: Optimizer = Field(..., title="The optimizer to use") optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: StrictStr = Field(..., title="Path in the config to the training data") corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data") batcher: Batcher = Field(..., title="Batcher for the training data")
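The pretraining schema gains n_save_epoch alongside n_save_every, so intermediate models can be written out both every n batches and after every n epochs. A sketch of setting both on an existing config, assuming the file already contains a [pretraining] block (for example one generated with spacy init config --pretraining); the paths are placeholders:

    from spacy.util import load_config

    config = load_config("config.cfg")           # placeholder path
    config["pretraining"]["n_save_every"] = 600  # save a temporary model every 600 batches
    config["pretraining"]["n_save_epoch"] = 2    # and additionally after every 2nd epoch
    config.to_disk("config_pretrain.cfg")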

View File

@ -247,18 +247,21 @@ class Scorer:
missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment]
**cfg, **cfg,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Return PRF scores per feat for a token attribute in UFEATS format. """Return micro PRF and PRF scores per feat for a token attribute in
UFEATS format.
examples (Iterable[Example]): Examples to score examples (Iterable[Example]): Examples to score
attr (str): The attribute to score. attr (str): The attribute to score.
getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
getter(token, attr) should return the value of the attribute for an getter(token, attr) should return the value of the attribute for an
individual token. individual token.
missing_values (Set[Any]): Attribute values to treat as missing annotation missing_values (Set[Any]): Attribute values to treat as missing
in the reference annotation. annotation in the reference annotation.
RETURNS (dict): A dictionary containing the per-feat PRF scores under RETURNS (dict): A dictionary containing the micro PRF scores under the
the key attr_per_feat. key attr_micro_p/r/f and the per-feat PRF scores under
attr_per_feat.
""" """
micro_score = PRFScore()
per_feat = {} per_feat = {}
for example in examples: for example in examples:
pred_doc = example.predicted pred_doc = example.predicted
@ -300,15 +303,22 @@ class Scorer:
pred_per_feat[field] = set() pred_per_feat[field] = set()
pred_per_feat[field].add((gold_i, feat)) pred_per_feat[field].add((gold_i, feat))
for field in per_feat: for field in per_feat:
micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set()))
per_feat[field].score_set( per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
) )
score_key = f"{attr}_per_feat" result: Dict[str, Any] = {}
if any([len(v) for v in per_feat.values()]): if len(micro_score) > 0:
result = {k: v.to_dict() for k, v in per_feat.items()} result[f"{attr}_micro_p"] = micro_score.precision
return {score_key: result} result[f"{attr}_micro_r"] = micro_score.recall
result[f"{attr}_micro_f"] = micro_score.fscore
result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()}
else: else:
return {score_key: None} result[f"{attr}_micro_p"] = None
result[f"{attr}_micro_r"] = None
result[f"{attr}_micro_f"] = None
result[f"{attr}_per_feat"] = None
return result
@staticmethod @staticmethod
def score_spans( def score_spans(
@ -545,7 +555,7 @@ class Scorer:
@staticmethod @staticmethod
def score_links( def score_links(
examples: Iterable[Example], *, negative_labels: Iterable[str] examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Returns PRF for predicted links on the entity level. """Returns PRF for predicted links on the entity level.
To disentangle the performance of the NEL from the NER, To disentangle the performance of the NEL from the NER,
@ -721,7 +731,7 @@ class Scorer:
} }
def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" """Compute micro-PRF and per-entity PRF scores for a sequence of examples."""
score_per_type = defaultdict(PRFScore) score_per_type = defaultdict(PRFScore)
for eg in examples: for eg in examples:

View File

@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t, hash_t from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0 cpdef hash_t hash_string(str string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string) cdef str decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str: ctypedef union Utf8Str:
@ -25,5 +25,5 @@ cdef class StringStore:
cdef vector[hash_t] keys cdef vector[hash_t] keys
cdef public PreshMap _map cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, unicode py_string) cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@ -33,7 +33,7 @@ def get_string_id(key):
return hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))
cpdef hash_t hash_string(unicode string) except 0: cpdef hash_t hash_string(str string) except 0:
chars = string.encode("utf8") chars = string.encode("utf8")
return hash_utf8(chars, len(chars)) return hash_utf8(chars, len(chars))
@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1) return hash32(utf8_string, length, 1)
cdef unicode decode_Utf8Str(const Utf8Str* string): cdef str decode_Utf8Str(const Utf8Str* string):
cdef int i, length cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0: if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode("utf8") return string.s[1:string.s[0]+1].decode("utf8")
@ -107,17 +107,17 @@ cdef class StringStore:
def __getitem__(self, object string_or_id): def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa. """Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode. string_or_id (bytes, str or uint64): The value to encode.
Returns (str / uint64): The value to be retrieved. Returns (str / uint64): The value to be retrieved.
""" """
if isinstance(string_or_id, basestring) and len(string_or_id) == 0: if isinstance(string_or_id, str) and len(string_or_id) == 0:
return 0 return 0
elif string_or_id == 0: elif string_or_id == 0:
return "" return ""
elif string_or_id in SYMBOLS_BY_STR: elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id] return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key cdef hash_t key
if isinstance(string_or_id, unicode): if isinstance(string_or_id, str):
key = hash_string(string_or_id) key = hash_string(string_or_id)
return key return key
elif isinstance(string_or_id, bytes): elif isinstance(string_or_id, bytes):
@ -135,14 +135,14 @@ cdef class StringStore:
def as_int(self, key): def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value.""" """If key is an int, return it; otherwise, get the int value."""
if not isinstance(key, basestring): if not isinstance(key, str):
return key return key
else: else:
return self[key] return self[key]
def as_string(self, key): def as_string(self, key):
"""If key is a string, return it; otherwise, get the string value.""" """If key is a string, return it; otherwise, get the string value."""
if isinstance(key, basestring): if isinstance(key, str):
return key return key
else: else:
return self[key] return self[key]
@ -153,7 +153,7 @@ cdef class StringStore:
string (str): The string to add. string (str): The string to add.
RETURNS (uint64): The string's hash value. RETURNS (uint64): The string's hash value.
""" """
if isinstance(string, unicode): if isinstance(string, str):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]
key = hash_string(string) key = hash_string(string)
@ -189,7 +189,7 @@ cdef class StringStore:
return True return True
elif string in SYMBOLS_BY_STR: elif string in SYMBOLS_BY_STR:
return True return True
elif isinstance(string, unicode): elif isinstance(string, str):
key = hash_string(string) key = hash_string(string)
else: else:
string = string.encode("utf8") string = string.encode("utf8")
@ -269,7 +269,7 @@ cdef class StringStore:
for string in strings: for string in strings:
self.add(string) self.add(string)
cdef const Utf8Str* intern_unicode(self, unicode py_string): cdef const Utf8Str* intern_unicode(self, str py_string):
# 0 means missing, but we don't bother offsetting the index. # 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8") cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string)) return self._intern_utf8(byte_string, len(byte_string))
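The strings.pxd/strings.pyx changes are a Python 2 cleanup (unicode/basestring become str); the public behaviour of the StringStore is unchanged. For reference, a minimal round trip:

    from spacy.strings import StringStore

    stringstore = StringStore(["apple"])
    orange_hash = stringstore.add("orange")  # returns the 64-bit hash
    assert stringstore[orange_hash] == "orange"
    assert stringstore["orange"] == orange_hash
    assert stringstore.as_string(orange_hash) == "orange"
    assert stringstore.as_int("orange") == orange_hash
    assert "orange" in stringstore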

View File

@ -5,9 +5,11 @@ from spacy.compat import pickle
def test_pickle_single_doc(): def test_pickle_single_doc():
nlp = Language() nlp = Language()
doc = nlp("pickle roundtrip") doc = nlp("pickle roundtrip")
doc._context = 3
data = pickle.dumps(doc, 1) data = pickle.dumps(doc, 1)
doc2 = pickle.loads(data) doc2 = pickle.loads(data)
assert doc2.text == "pickle roundtrip" assert doc2.text == "pickle roundtrip"
assert doc2._context == 3
def test_list_of_docs_pickles_efficiently(): def test_list_of_docs_pickles_efficiently():

View File

@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma):
def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer):
text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda." text = "La Dra. Puig viu a la pl. dels Til·lers."
tokens = ca_tokenizer(text) doc = ca_tokenizer(text)
assert len(tokens) == 15 assert [t.text for t in doc] == [
assert tokens[7].text == "aprox." "La",
"Dra.",
"Puig",
"viu",
"a",
"la",
"pl.",
"d",
"els",
"Til·lers",
".",
]

View File

@ -2,7 +2,14 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])] "text,expected_tokens",
[
("d'un", ["d'", "un"]),
("s'ha", ["s'", "ha"]),
("del", ["d", "el"]),
("cantar-te", ["cantar", "-te"]),
("-hola", ["-", "hola"]),
],
) )
def test_contractions(ca_tokenizer, text, expected_tokens): def test_contractions(ca_tokenizer, text, expected_tokens):
"""Test that the contractions are split into two tokens""" """Test that the contractions are split into two tokens"""

View File

@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text) tokens = ca_tokenizer(text)
assert len(tokens) == 140 assert len(tokens) == 146
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,length", "text,length",
[ [
("Perquè va anar-hi?", 4), ("Perquè va anar-hi?", 5),
("El cotxe dels veins.", 6),
("“Ah no?”", 5), ("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11), ("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5), ("Van córrer aprox. 10km", 5),
("Llavors perqué...", 3), ("Llavors perqué...", 3),
("Vull parlar-te'n demà al matí", 8),
("Vull explicar-t'ho demà al matí", 8),
], ],
) )
def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length): def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length):

View File

@ -8,3 +8,17 @@ import pytest
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_ test_lemma = ja_tokenizer(word)[0].lemma_
assert test_lemma == lemma assert test_lemma == lemma
@pytest.mark.parametrize(
"word,norm",
[
("SUMMER", "サマー"),
("食べ物", "食べ物"),
("綜合", "総合"),
("コンピュータ", "コンピューター"),
],
)
def test_ja_lemmatizer_norm(ja_tokenizer, word, norm):
test_norm = ja_tokenizer(word)[0].norm_
assert test_norm == norm

View File

@ -0,0 +1,9 @@
import pytest
from spacy.lang.ja import Japanese
def test_ja_morphologizer_factory():
pytest.importorskip("sudachipy")
nlp = Japanese()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.cfg["extend"] is True

View File

@ -1,3 +1,5 @@
import pickle
from spacy.lang.ja import Japanese from spacy.lang.ja import Japanese
from ...util import make_tempdir from ...util import make_tempdir
@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer):
nlp_r.from_disk(d) nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes() assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.split_mode == "B" assert nlp_r.tokenizer.split_mode == "B"
def test_ja_tokenizer_pickle(ja_tokenizer):
b = pickle.dumps(ja_tokenizer)
ja_tokenizer_re = pickle.loads(b)
assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes()

View File

@ -34,22 +34,22 @@ SENTENCE_TESTS = [
] ]
tokens1 = [ tokens1 = [
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None), DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
] ]
tokens2 = [ tokens2 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None),
DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", reading="カイ", sub_tokens=None), DetailedToken(surface="", tag="名詞-普通名詞-一般", inf="", lemma="", norm="", reading="カイ", sub_tokens=None),
] ]
tokens3 = [ tokens3 = [
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None),
DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None),
DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None), DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None),
] ]
SUB_TOKEN_TESTS = [ SUB_TOKEN_TESTS = [
("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]]) ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]])
] ]
# fmt: on # fmt: on
@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
assert len(nlp_c(text)) == len_c assert len(nlp_c(text)) == len_c
@pytest.mark.parametrize( @pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS)
"text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS
)
def test_ja_tokenizer_sub_tokens( def test_ja_tokenizer_sub_tokens(
ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c
): ):
nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}}) nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}}) nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}})
nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}}) nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}})
assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a assert ja_tokenizer(text).user_data.get("sub_tokens") is None
assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a assert nlp_a(text).user_data.get("sub_tokens") is None
assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
@ -132,16 +130,24 @@ def test_ja_tokenizer_sub_tokens(
[ [
( (
"取ってつけた", "取ってつけた",
("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
("トッ", "", "ツケ", ""), (["トッ"], [""], ["ツケ"], [""]),
),
(
"2=3",
([], [], []),
([""], ["_"], ["サン"])
), ),
], ],
) )
def test_ja_tokenizer_inflections_reading_forms( def test_ja_tokenizer_inflections_reading_forms(
ja_tokenizer, text, inflections, reading_forms ja_tokenizer, text, inflections, reading_forms
): ):
assert ja_tokenizer(text).user_data["inflections"] == inflections tokens = ja_tokenizer(text)
assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms test_inflections = [tt.morph.get("Inflection") for tt in tokens]
assert test_inflections == list(inflections)
test_readings = [tt.morph.get("Reading") for tt in tokens]
assert test_readings == list(reading_forms)
def test_ja_tokenizer_emptyish_texts(ja_tokenizer): def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
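As the updated tests show, the Japanese tokenizer now exposes readings and inflections through Token.morph (fields "Reading" and "Inflection") and the normalized form through token.norm_, instead of Doc.user_data. A sketch, assuming the optional SudachiPy dependencies for spacy.blank("ja") are installed:

    import spacy

    nlp = spacy.blank("ja")  # requires sudachipy + sudachidict_core
    doc = nlp("取ってつけた")
    for token in doc:
        print(token.text, token.norm_, token.morph.get("Reading"), token.morph.get("Inflection"))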

View File

@ -0,0 +1,24 @@
import pickle
from spacy.lang.ko import Korean
from ...util import make_tempdir
def test_ko_tokenizer_serialize(ko_tokenizer):
tokenizer_bytes = ko_tokenizer.to_bytes()
nlp = Korean()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
ko_tokenizer.to_disk(file_path)
nlp = Korean()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_ko_tokenizer_pickle(ko_tokenizer):
b = pickle.dumps(ko_tokenizer)
ko_tokenizer_re = pickle.loads(b)
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import pytest import pytest

View File

@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
("www.google.com", True), ("www.google.com", True),
("google.com", True), ("google.com", True),
("sydney.com", True), ("sydney.com", True),
("2girls1cup.org", True), ("1abc2def.org", True),
("http://stupid", True), ("http://stupid", True),
("www.hi", True), ("www.hi", True),
("example.com/example", True),
("dog", False), ("dog", False),
("1.2", False), ("1.2", False),
("1.a", False), ("1.a", False),

View File

@ -0,0 +1,24 @@
import pickle
from spacy.lang.th import Thai
from ...util import make_tempdir
def test_th_tokenizer_serialize(th_tokenizer):
tokenizer_bytes = th_tokenizer.to_bytes()
nlp = Thai()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
th_tokenizer.to_disk(file_path)
nlp = Thai()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_th_tokenizer_pickle(th_tokenizer):
b = pickle.dumps(th_tokenizer)
th_tokenizer_re = pickle.loads(b)
assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes()

View File

@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length):
("10.000", True), ("10.000", True),
("1000", True), ("1000", True),
("999,0", True), ("999,0", True),
("", True), ("", True),
("ክልተ", True), ("ክልተ", True),
("ትሪልዮን", True), ("ትሪልዮን", True),
("ከልቢ", False), ("ከልቢ", False),

View File

@ -1,3 +1,5 @@
import pickle
from spacy.lang.vi import Vietnamese from spacy.lang.vi import Vietnamese
from ...util import make_tempdir from ...util import make_tempdir
@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer):
nlp_r.from_disk(d) nlp_r.from_disk(d)
assert nlp_bytes == nlp_r.to_bytes() assert nlp_bytes == nlp_r.to_bytes()
assert nlp_r.tokenizer.use_pyvi is False assert nlp_r.tokenizer.use_pyvi is False
def test_vi_tokenizer_pickle(vi_tokenizer):
b = pickle.dumps(vi_tokenizer)
vi_tokenizer_re = pickle.loads(b)
assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes()

View File

@ -32,24 +32,6 @@ def pattern_dicts():
] ]
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture @pytest.fixture
def tag_map(): def tag_map():
return { return {
@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
# initialize with patterns from misc registry # initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
nlp.config["initialize"]["components"]["attribute_ruler"] = { nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"} "patterns": {"@misc": "attribute_ruler_patterns"}
} }
@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts):
assert scores["lemma_acc"] == pytest.approx(0.2) assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set # no morphs are set
assert scores["morph_acc"] is None assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")
# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}
return weird_scorer
ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp): def test_attributeruler_rule_order(nlp):

View File

@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
from spacy.morphology import Morphology from spacy.morphology import Morphology
from spacy.attrs import MORPH from spacy.attrs import MORPH
from spacy.tokens import Doc
def test_label_types(): def test_label_types():
@ -137,6 +138,41 @@ def test_overfitting_IO():
assert [str(t.morph) for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
# Test overwrite+extend settings
# (note that "" is unset, "_" is set and empty)
morphs = ["Feat=V", "Feat=N", "_"]
doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs)
orig_morphs = [str(t.morph) for t in doc]
orig_pos_tags = [t.pos_ for t in doc]
morphologizer = nlp.get_pipe("morphologizer")
# don't overwrite or extend
morphologizer.cfg["overwrite"] = False
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == orig_morphs
assert [t.pos_ for t in doc] == orig_pos_tags
# overwrite and extend
morphologizer.cfg["overwrite"] = True
morphologizer.cfg["extend"] = True
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"]
# extend without overwriting
morphologizer.cfg["overwrite"] = False
morphologizer.cfg["extend"] = True
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=A|That=A|This=A", "Feat=V|That=B"]
# overwrite without extending
morphologizer.cfg["overwrite"] = True
morphologizer.cfg["extend"] = False
doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""])
doc = morphologizer(doc)
assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"]
# Test with unset morph and partial POS # Test with unset morph and partial POS
nlp.remove_pipe("morphologizer") nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer") nlp.add_pipe("morphologizer")

View File

@ -1,7 +1,9 @@
import pytest import pytest
import pickle import pickle
from thinc.api import get_current_ops
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.strings import StringStore from spacy.strings import StringStore
from spacy.vectors import Vectors
from ..util import make_tempdir from ..util import make_tempdir
@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr): def test_pickle_vocab(strings, lex_attr):
vocab = Vocab(strings=strings) vocab = Vocab(strings=strings)
ops = get_current_ops()
vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
vocab.vectors = vectors
vocab[strings[0]].norm_ = lex_attr vocab[strings[0]].norm_ = lex_attr
vocab_pickled = pickle.dumps(vocab) vocab_pickled = pickle.dumps(vocab)
vocab_unpickled = pickle.loads(vocab_pickled) vocab_unpickled = pickle.loads(vocab_pickled)
assert vocab.to_bytes() == vocab_unpickled.to_bytes() assert vocab.to_bytes() == vocab_unpickled.to_bytes()
assert vocab_unpickled.vectors.mode == "floret"

View File

@ -1,5 +1,6 @@
import pytest import pytest
from click import NoSuchOption from click import NoSuchOption
from packaging.specifiers import SpecifierSet
from spacy.training import docs_to_json, offsets_to_biluo_tags from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
@ -491,19 +492,27 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3] assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_download_compatibility(): def test_download_compatibility():
model_name = "en_core_web_sm" spec = SpecifierSet("==" + about.__version__)
compatibility = get_compatibility() spec.prereleases = False
version = get_version(model_name, compatibility) if about.__version__ in spec:
assert get_minor_version(about.__version__) == get_minor_version(version) model_name = "en_core_web_sm"
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release")
def test_validate_compatibility_table(): def test_validate_compatibility_table():
model_pkgs, compat = get_model_pkgs() spec = SpecifierSet("==" + about.__version__)
spacy_version = get_minor_version(about.__version__) spec.prereleases = False
current_compat = compat.get(spacy_version, {}) if about.__version__ in spec:
assert len(current_compat) > 0 model_pkgs, compat = get_model_pkgs()
assert "en_core_web_sm" in current_compat spacy_version = get_minor_version(about.__version__)
current_compat = compat.get(spacy_version, {})
assert len(current_compat) > 0
assert "en_core_web_sm" in current_compat
@pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"]) @pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"])

View File

@ -8,7 +8,7 @@ from spacy.vocab import Vocab
from spacy.training import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German
from spacy.util import registry, ignore_error, raise_error from spacy.util import registry, ignore_error, raise_error, find_matching_language
import spacy import spacy
from thinc.api import CupyOps, NumpyOps, get_current_ops from thinc.api import CupyOps, NumpyOps, get_current_ops
@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process):
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
"""Test the error handling of nlp.pipe with input as tuples"""
Language.component("my_evil_component", func=evil_component)
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.add_pipe("my_evil_component")
texts = [
("TEXT 111", 111),
("TEXT 222", 222),
("TEXT 333", 333),
("TEXT 342", 342),
("TEXT 666", 666),
]
with pytest.raises(ValueError):
list(nlp.pipe(texts, as_tuples=True))
nlp.set_error_handler(warn_error)
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
# HACK/TODO? the warnings in child processes don't seem to be
# detected by the mock logger
if n_process == 1:
mock_warning.assert_called()
assert mock_warning.call_count == 2
assert len(tuples) + mock_warning.call_count == len(texts)
assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
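nlp.pipe(..., as_tuples=True) streams (text, context) pairs, and the new test checks that documents rejected by the error handler are skipped or warned about without desynchronizing the contexts. Basic usage, independent of the error-handler details:

    import spacy

    nlp = spacy.blank("en")
    data = [("A short text", {"id": 1}), ("Another text", {"id": 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context["id"], doc.text)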
@pytest.mark.parametrize("n_process", [1, 2]) @pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process): def test_language_pipe_error_handler_pipe(en_vocab, n_process):
"""Test the error handling of a component's pipe method""" """Test the error handling of a component's pipe method"""
@ -512,6 +544,55 @@ def test_spacy_blank():
assert nlp.meta["name"] == "my_custom_model" assert nlp.meta["name"] == "my_custom_model"
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
],
)
def test_language_matching(lang, target):
"""
Test that we can look up languages by equivalent or nearly-equivalent
language codes.
"""
assert find_matching_language(lang) == target
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("mo", "ro"),
("mul", "xx"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):
"""
Test that we can get spacy.blank in various languages, including codes
that are defined to be equivalent or that match by CLDR language matching.
"""
nlp = spacy.blank(lang)
assert nlp.lang == target
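The two new test groups cover the language-code matching added in spacy.util.find_matching_language and used by spacy.blank, so equivalent or legacy codes resolve to the canonical spaCy language. For example:

    import spacy
    from spacy.util import find_matching_language

    assert find_matching_language("fra") == "fr"  # ISO 639-2 alias
    assert find_matching_language("iw") == "he"   # legacy code
    nlp = spacy.blank("mul")                      # resolves to spaCy's multi-language "xx"
    assert nlp.lang == "xx"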
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab]) @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
def test_language_init_invalid_vocab(value): def test_language_init_invalid_vocab(value):
err_fragment = "invalid value" err_fragment = "invalid value"
@ -540,6 +621,32 @@ def test_language_source_and_vectors(nlp2):
assert nlp.vocab.vectors.to_bytes() == vectors_bytes assert nlp.vocab.vectors.to_bytes() == vectors_bytes
@pytest.mark.parametrize("n_process", [1, 2])
def test_pass_doc_to_pipeline(nlp, n_process):
texts = ["cats", "dogs", "guinea pigs"]
docs = [nlp.make_doc(text) for text in texts]
assert not any(len(doc.cats) for doc in docs)
doc = nlp(docs[0])
assert doc.text == texts[0]
assert len(doc.cats) > 0
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
docs = nlp.pipe(docs, n_process=n_process)
assert [doc.text for doc in docs] == texts
assert all(len(doc.cats) for doc in docs)
def test_invalid_arg_to_pipeline(nlp):
str_list = ["This is a text.", "This is another."]
with pytest.raises(ValueError):
nlp(str_list) # type: ignore
assert len(list(nlp.pipe(str_list))) == 2
int_list = [1, 2, 3]
with pytest.raises(ValueError):
list(nlp.pipe(int_list)) # type: ignore
with pytest.raises(ValueError):
nlp(int_list) # type: ignore
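test_pass_doc_to_pipeline and test_invalid_arg_to_pipeline pin down the new input handling: nlp() and nlp.pipe() accept pre-tokenized Doc objects and raise a ValueError for anything that is neither text nor Doc. A short sketch:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp.make_doc("guinea pigs")  # tokenize only
    doc = nlp(doc)                     # run the pipeline components on an existing Doc
    docs = list(nlp.pipe([nlp.make_doc("cats"), nlp.make_doc("dogs")]))
    assert [d.text for d in docs] == ["cats", "dogs"]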
@pytest.mark.skipif( @pytest.mark.skipif(
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU" not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
) )

Some files were not shown because too many files have changed in this diff.