diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 80c88b0b8..8df593bb7 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -65,8 +65,11 @@ steps: condition: eq(${{ parameters.gpu }}, true) - script: | - python -m spacy download ca_core_news_sm - python -m spacy download ca_core_news_md + #python -m spacy download ca_core_news_sm + #python -m spacy download ca_core_news_md + # temporarily install the v3.1.0 models + pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.1.0/ca_core_news_sm-3.1.0-py3-none-any.whl + pip install --no-deps https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.1.0/ca_core_news_md-3.1.0-py3-none-any.whl python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') @@ -95,7 +98,8 @@ steps: - script: | python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" - PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + # temporarily ignore W095 + PYTHONWARNINGS="error,ignore:[W095]:UserWarning,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir displayName: 'Test assemble CLI' condition: eq(variables['python_version'], '3.8') diff --git a/.github/contributors/avi197.md b/.github/contributors/avi197.md new file mode 100644 index 000000000..903d7db4c --- /dev/null +++ b/.github/contributors/avi197.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Son Pham | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 09/10/2021 | +| GitHub username | Avi197 | +| Website (optional) | | diff --git a/.github/contributors/fgaim.md b/.github/contributors/fgaim.md new file mode 100644 index 000000000..1c3b409b4 --- /dev/null +++ b/.github/contributors/fgaim.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Fitsum Gaim | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-08-07 | +| GitHub username | fgaim | +| Website (optional) | | diff --git a/.github/contributors/syrull.md b/.github/contributors/syrull.md new file mode 100644 index 000000000..82cdade12 --- /dev/null +++ b/.github/contributors/syrull.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. 
This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Dimitar Ganev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021/8/2 | +| GitHub username | syrull | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index ac72f2bbf..60036a475 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ keys/ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt +spacy/tests/universe/universe.json # Website website/.cache/ diff --git a/requirements.txt b/requirements.txt index 36cf5c58e..7e200be51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ # Our libraries spacy-legacy>=3.0.8,<3.1.0 +spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.12,<8.1.0 @@ -17,6 +18,7 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 +langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 diff --git a/setup.cfg b/setup.cfg index dc31228e5..72f4b39da 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ setup_requires = install_requires = # Our libraries spacy-legacy>=3.0.8,<3.1.0 + spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 @@ -62,6 +63,7 @@ install_requires = setuptools packaging>=20.0 typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8" + langcodes>=3.2.0,<4.0.0 [options.entry_points] console_scripts = @@ -69,9 +71,9 @@ console_scripts = [options.extras_require] lookups = - spacy_lookups_data>=1.0.2,<1.1.0 + spacy_lookups_data>=1.0.3,<1.1.0 transformers = - spacy_transformers>=1.0.1,<1.2.0 + spacy_transformers>=1.1.2,<1.2.0 ray = spacy_ray>=0.1.0,<1.0.0 cuda = diff --git a/setup.py b/setup.py index fcc124a43..03a1e01dd 100755 --- a/setup.py +++ b/setup.py @@ -81,6 +81,7 @@ COPY_FILES = { ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", + ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe", } diff --git a/spacy/about.py b/spacy/about.py index e6846f8d4..29f78805c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.1.4" +__version__ = "3.2.0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 9122de17b..640fb2f3c 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -142,7 +142,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: - if strings_map is not None and isinstance(value, 
basestring): + if strings_map is not None and isinstance(value, str): if hasattr(strings_map, 'add'): value = strings_map.add(value) else: diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 2a920cdda..d53a61b8e 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -20,6 +20,7 @@ def init_vectors_cli( output_dir: Path = Arg(..., help="Pipeline output directory"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), @@ -34,7 +35,14 @@ def init_vectors_cli( nlp = util.get_lang_class(lang)() if jsonl_loc is not None: update_lexemes(nlp, jsonl_loc) - convert_vectors(nlp, vectors_loc, truncate=truncate, prune=prune, name=name) + convert_vectors( + nlp, + vectors_loc, + truncate=truncate, + prune=prune, + name=name, + mode=mode, + ) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") nlp.to_disk(output_dir) msg.good( diff --git a/spacy/default_config_pretraining.cfg b/spacy/default_config_pretraining.cfg index 16f767772..d70ecf04c 100644 --- a/spacy/default_config_pretraining.cfg +++ b/spacy/default_config_pretraining.cfg @@ -5,6 +5,7 @@ raw_text = null max_epochs = 1000 dropout = 0.2 n_save_every = null +n_save_epoch = null component = "tok2vec" layer = "" corpus = "corpora.pretrain" diff --git a/spacy/errors.py b/spacy/errors.py index 2da52e3b8..5fe550145 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -22,6 +22,9 @@ def setup_default_warnings(): # warn once about lemmatizer without required POS filter_warning("once", error_msg=Warnings.W108) + # floret vector table cannot be modified + filter_warning("once", error_msg="[W114]") + def filter_warning(action: str, error_msg: str): """Customize how spaCy should handle a certain warning. @@ -186,6 +189,8 @@ class Warnings(metaclass=ErrorsWithCodes): "vectors are not identical to current pipeline vectors.") W114 = ("Using multiprocessing with GPU models is not recommended and may " "lead to errors.") + W115 = ("Skipping {method}: the floret vector table cannot be modified. " + "Vectors are calculated from character ngrams.") class Errors(metaclass=ErrorsWithCodes): @@ -277,7 +282,7 @@ class Errors(metaclass=ErrorsWithCodes): "you forget to call the `set_extension` method?") E047 = ("Can't assign a value to unregistered extension attribute " "'{name}'. Did you forget to call the `set_extension` method?") - E048 = ("Can't import language {lang} from spacy.lang: {err}") + E048 = ("Can't import language {lang} or any matching language from spacy.lang: {err}") E050 = ("Can't find model '{name}'. It doesn't seem to be a Python " "package or a valid path to a data directory.") E052 = ("Can't find model directory: {path}") @@ -511,13 +516,24 @@ class Errors(metaclass=ErrorsWithCodes): E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.") E200 = ("Can't yet set {attr} from Span. 
Vote for this feature on the " "issue tracker: http://github.com/explosion/spaCy/issues") - E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") + E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x - E866 = ("A SpanGroup is not functional after the corresponding Doc has " + E858 = ("The {mode} vector table does not support this operation. " + "{alternative}") + E859 = ("The floret vector table cannot be modified.") + E860 = ("Can't truncate fasttext-bloom vectors.") + E861 = ("No 'keys' should be provided when initializing floret vectors " + "with 'minn' and 'maxn'.") + E862 = ("'hash_count' must be between 1-4 for floret vectors.") + E863 = ("'maxn' must be greater than or equal to 'minn'.") + E864 = ("The complete vector table 'data' is required to initialize floret " + "vectors.") + E865 = ("A SpanGroup is not functional after the corresponding Doc has " "been garbage collected. To keep using the spans, make sure that " "the corresponding Doc object is still available in the scope of " "your function.") + E866 = ("Expected a string or 'Doc' as input, but got: {type}.") E867 = ("The 'textcat' component requires at least two labels because it " "uses mutually exclusive classes where exactly one label is True " "for each doc. For binary classification tasks, you can use two " diff --git a/spacy/kb.pyx b/spacy/kb.pyx index fed3009da..9a765c8e4 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -124,7 +124,7 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_entity(self, unicode entity, float freq, vector[float] entity_vector): + def add_entity(self, str entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. @@ -185,15 +185,15 @@ cdef class KnowledgeBase: i += 1 - def contains_entity(self, unicode entity): + def contains_entity(self, str entity): cdef hash_t entity_hash = self.vocab.strings.add(entity) return entity_hash in self._entry_index - def contains_alias(self, unicode alias): + def contains_alias(self, str alias): cdef hash_t alias_hash = self.vocab.strings.add(alias) return alias_hash in self._alias_index - def add_alias(self, unicode alias, entities, probabilities): + def add_alias(self, str alias, entities, probabilities): """ For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end @@ -239,7 +239,7 @@ cdef class KnowledgeBase: raise RuntimeError(Errors.E891.format(alias=alias)) return alias_hash - def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False): + def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False): """ For an alias already existing in the KB, extend its potential entities with one more. Throw a warning if either the alias or the entity is unknown, @@ -286,7 +286,7 @@ cdef class KnowledgeBase: alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]: + def get_alias_candidates(self, str alias) -> Iterator[Candidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. 
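The kb.pyx hunks here swap the deprecated Cython `unicode` annotation for plain `str` in the KnowledgeBase method signatures. The sketch below is illustrative only and not part of this patch; the entity ID, alias, frequencies and vector values are made-up example data, and it simply exercises the methods touched above with ordinary Python strings.

# Minimal usage sketch (assumed example values; spaCy >= 3.2 installed)
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
# entities, aliases and vectors here are plain str / float examples
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])
assert kb.contains_entity("Q42") and kb.contains_alias("Douglas")
for candidate in kb.get_alias_candidates("Douglas"):
    # each Candidate carries the entity, the original alias and the prior probability
    print(candidate.entity_, candidate.alias_, candidate.prior_prob)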
@@ -307,7 +307,7 @@ cdef class KnowledgeBase: for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] - def get_vector(self, unicode entity): + def get_vector(self, str entity): cdef hash_t entity_hash = self.vocab.strings[entity] # Return an empty list if this entity is unknown in this KB @@ -317,7 +317,7 @@ cdef class KnowledgeBase: return self._vectors_table[self._entries[entry_index].vector_index] - def get_prior_prob(self, unicode entity, unicode alias): + def get_prior_prob(self, str entity, str alias): """ Return the prior probability of a given alias being linked to a given entity, or return 0.0 when this combination is not known in the knowledge base""" cdef hash_t alias_hash = self.vocab.strings[alias] @@ -587,7 +587,7 @@ cdef class Writer: def __init__(self, path): assert isinstance(path, Path) content = bytes(path) - cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content + cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content self._fp = fopen(bytes_loc, 'wb') if not self._fp: raise IOError(Errors.E146.format(path=path)) @@ -629,7 +629,7 @@ cdef class Writer: cdef class Reader: def __init__(self, path): content = bytes(path) - cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content + cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content self._fp = fopen(bytes_loc, 'rb') if not self._fp: PyErr_SetFromErrno(IOError) diff --git a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py index 70af12039..555a179fa 100644 --- a/spacy/lang/am/punctuation.py +++ b/spacy/lang/am/punctuation.py @@ -1,7 +1,7 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER -_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() +_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() _suffixes = ( _list_punct diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index aae7692a2..df708b65e 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -1,265 +1,79 @@ -# Source: https://github.com/Alir3z4/stop-words - +""" +References: + https://github.com/Alir3z4/stop-words - Original list, serves as a base. + https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it. 
+""" STOP_WORDS = set( """ -а -автентичен -аз -ако -ала -бе -без -беше -би -бивш -бивша -бившо -бил -била -били -било -благодаря -близо -бъдат -бъде -бяха -в -вас -ваш -ваша -вероятно -вече -взема -ви -вие -винаги -внимава -време -все -всеки -всички -всичко -всяка -във -въпреки -върху -г -ги -главен -главна -главно -глас -го -година -години -годишен -д -да -дали -два -двама -двамата -две -двете -ден -днес -дни -до -добра -добре -добро -добър -докато -докога -дори -досега -доста -друг -друга -други -е -евтин -едва -един -една -еднаква -еднакви -еднакъв -едно -екип -ето -живот -за -забавям -зад -заедно -заради -засега -заспал -затова -защо -защото -и -из -или -им -има -имат -иска -й -каза -как -каква -какво -както -какъв -като -кога -когато -което -които -кой -който -колко -която -къде -където -към -лесен -лесно -ли -лош -м -май -малко -ме -между -мек -мен -месец -ми -много -мнозина -мога -могат -може -мокър -моля -момента -му -н -на -над -назад -най -направи -напред -например -нас -не -него -нещо -нея -ни -ние -никой -нито -нищо -но -нов -нова -нови -новина -някои -някой -няколко -няма -обаче -около -освен -особено -от -отгоре -отново -още -пак -по -повече -повечето -под -поне -поради -после -почти -прави -пред -преди -през -при -пък -първата -първи -първо -пъти -равен -равна -с -са -сам -само -се -сега -си -син -скоро -след -следващ -сме -смях -според -сред -срещу -сте -съм -със -също -т -тази -така -такива -такъв -там -твой -те -тези -ти -т.н. -то -това -тогава -този -той -толкова -точно -три -трябва -тук -тъй -тя -тях -у -утре -харесва -хиляди -ч -часа -че -често -чрез -ще -щом +а автентичен аз ако ала + +бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат +бъде бъда бяха + +в вас ваш ваша вашата вашият вероятно вече взема ви вие винаги внимава време все +всеки всички вместо всичко вследствие всъщност всяка втори във въпреки върху +вътре веднъж + +г ги главен главна главно глас го годно година години годишен + +д да дали далеч далече два двама двамата две двете ден днес дни до добра добре +добро добър достатъчно докато докога дори досега доста друг друга другаде други + +е евтин едва един една еднаква еднакви еднакъв едно екип ето + +живот жив + +за здравей здрасти знае зная забавям зад зададени заедно заради засега заспал +затова запазва започвам защо защото завинаги + +и из или им има имат иска искам използвайки изглежда изглеждаше изглеждайки +извън имайки + +й йо + +каза казва казвайки казвам как каква какво както какъв като кога кауза каузи +когато когото което които кой който колко която къде където към край кратък +кръгъл + +лесен лесно ли летя летиш летим лош + +м май малко макар малцина междувременно минус ме между мек мен месец ми мис +мисля много мнозина мога могат може мой можем мокър моля момента му + +н на над назад най наш навсякъде навътре нагоре направи напред надолу наистина +например наопаки наполовина напоследък нека независимо нас насам наскоро +настрана необходимо него негов нещо нея ни ние никой нито нищо но нов някак нова +нови новина някои някой някога някъде няколко няма + +о обаче около описан опитах опитва опитвайки опитвам определен определено освен +обикновено осигурява обратно означава особен особено от ох отвъд отгоре отдолу +отново отива отивам отидох отсега отделно отколкото откъдето очевидно оттам +относно още + +п пак по повече повечето под поне просто пряко поради после последен последно +посочен почти прави прав прави правя пред преди през при пък първата първи първо +път пъти плюс + +равен равна различен различни разумен 
разумно + +с са сам само себе сериозно сигурен сигурно се сега си син скоро скорошен след +следващ следващия следва следното следователно случва сме смях собствен +сравнително смея според сред става срещу съвсем съдържа съдържащ съжалявам +съответен съответно сте съм със също + +т така техен техни такива такъв твърде там трета твой те тези ти то това +тогава този той търси толкова точно три трябва тук тъй тя тях + +у утре ужасно употреба успоредно уточнен уточняване + +харесва харесали хиляди + +ч часа ценя цяло цялостен че често чрез чудя + +ще щеше щом щяха + юмрук -я -як + +я як """.split() ) diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py index 0b7487c64..0f484b778 100644 --- a/spacy/lang/bg/tokenizer_exceptions.py +++ b/spacy/lang/bg/tokenizer_exceptions.py @@ -1,10 +1,16 @@ +""" +References: + https://slovored.com/bg/abbr/grammar/ - Additional refs for abbreviations + (countries, occupations, fields of studies and more). +""" + from ...symbols import ORTH, NORM _exc = {} - -_abbr_exc = [ +# measurements +for abbr in [ {ORTH: "м", NORM: "метър"}, {ORTH: "мм", NORM: "милиметър"}, {ORTH: "см", NORM: "сантиметър"}, @@ -17,51 +23,191 @@ _abbr_exc = [ {ORTH: "хл", NORM: "хектолиър"}, {ORTH: "дкл", NORM: "декалитър"}, {ORTH: "л", NORM: "литър"}, -] -for abbr in _abbr_exc: +]: _exc[abbr[ORTH]] = [abbr] -_abbr_line_exc = [ +# line abbreviations +for abbr in [ {ORTH: "г-жа", NORM: "госпожа"}, {ORTH: "г-н", NORM: "господин"}, {ORTH: "г-ца", NORM: "госпожица"}, {ORTH: "д-р", NORM: "доктор"}, {ORTH: "о-в", NORM: "остров"}, {ORTH: "п-в", NORM: "полуостров"}, -] - -for abbr in _abbr_line_exc: + {ORTH: "с-у", NORM: "срещу"}, + {ORTH: "в-у", NORM: "върху"}, + {ORTH: "м-у", NORM: "между"}, +]: _exc[abbr[ORTH]] = [abbr] -_abbr_dot_exc = [ +# foreign language related abbreviations +for abbr in [ + {ORTH: "англ.", NORM: "английски"}, + {ORTH: "ан.", NORM: "английски термин"}, + {ORTH: "араб.", NORM: "арабски"}, + {ORTH: "афр.", NORM: "африкански"}, + {ORTH: "гр.", NORM: "гръцки"}, + {ORTH: "лат.", NORM: "латински"}, + {ORTH: "рим.", NORM: "римски"}, + {ORTH: "старогр.", NORM: "старогръцки"}, + {ORTH: "староевр.", NORM: "староеврейски"}, + {ORTH: "фр.", NORM: "френски"}, + {ORTH: "хол.", NORM: "холандски"}, + {ORTH: "швед.", NORM: "шведски"}, + {ORTH: "шотл.", NORM: "шотландски"}, + {ORTH: "яп.", NORM: "японски"}, +]: + _exc[abbr[ORTH]] = [abbr] + +# profession and academic titles abbreviations +for abbr in [ {ORTH: "акад.", NORM: "академик"}, - {ORTH: "ал.", NORM: "алинея"}, {ORTH: "арх.", NORM: "архитект"}, + {ORTH: "инж.", NORM: "инженер"}, + {ORTH: "канц.", NORM: "канцлер"}, + {ORTH: "проф.", NORM: "професор"}, + {ORTH: "св.", NORM: "свети"}, +]: + _exc[abbr[ORTH]] = [abbr] + +# fields of studies +for abbr in [ + {ORTH: "агр.", NORM: "агрономия"}, + {ORTH: "ав.", NORM: "авиация"}, + {ORTH: "агр.", NORM: "агрономия"}, + {ORTH: "археол.", NORM: "археология"}, + {ORTH: "астр.", NORM: "астрономия"}, + {ORTH: "геод.", NORM: "геодезия"}, + {ORTH: "геол.", NORM: "геология"}, + {ORTH: "геом.", NORM: "геометрия"}, + {ORTH: "гимн.", NORM: "гимнастика"}, + {ORTH: "грам.", NORM: "граматика"}, + {ORTH: "жур.", NORM: "журналистика"}, + {ORTH: "журн.", NORM: "журналистика"}, + {ORTH: "зем.", NORM: "земеделие"}, + {ORTH: "икон.", NORM: "икономика"}, + {ORTH: "лит.", NORM: "литература"}, + {ORTH: "мат.", NORM: "математика"}, + {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "муз.", NORM: "музика"}, + {ORTH: "печ.", NORM: "печатарство"}, + {ORTH: "пол.", NORM: "политика"}, + {ORTH: 
"псих.", NORM: "психология"}, + {ORTH: "соц.", NORM: "социология"}, + {ORTH: "стат.", NORM: "статистика"}, + {ORTH: "стил.", NORM: "стилистика"}, + {ORTH: "топогр.", NORM: "топография"}, + {ORTH: "търг.", NORM: "търговия"}, + {ORTH: "фарм.", NORM: "фармацевтика"}, + {ORTH: "фехт.", NORM: "фехтовка"}, + {ORTH: "физиол.", NORM: "физиология"}, + {ORTH: "физ.", NORM: "физика"}, + {ORTH: "фил.", NORM: "философия"}, + {ORTH: "фин.", NORM: "финанси"}, + {ORTH: "фолкл.", NORM: "фолклор"}, + {ORTH: "фон.", NORM: "фонетика"}, + {ORTH: "фот.", NORM: "фотография"}, + {ORTH: "футб.", NORM: "футбол"}, + {ORTH: "хим.", NORM: "химия"}, + {ORTH: "хир.", NORM: "хирургия"}, + {ORTH: "ел.", NORM: "електротехника"}, +]: + _exc[abbr[ORTH]] = [abbr] + +for abbr in [ + {ORTH: "ал.", NORM: "алинея"}, + {ORTH: "авт.", NORM: "автоматично"}, + {ORTH: "адм.", NORM: "администрация"}, + {ORTH: "арт.", NORM: "артилерия"}, {ORTH: "бл.", NORM: "блок"}, {ORTH: "бр.", NORM: "брой"}, {ORTH: "бул.", NORM: "булевард"}, + {ORTH: "букв.", NORM: "буквално"}, {ORTH: "в.", NORM: "век"}, + {ORTH: "вр.", NORM: "време"}, + {ORTH: "вм.", NORM: "вместо"}, + {ORTH: "воен.", NORM: "военен термин"}, {ORTH: "г.", NORM: "година"}, {ORTH: "гр.", NORM: "град"}, + {ORTH: "гл.", NORM: "глагол"}, + {ORTH: "др.", NORM: "други"}, + {ORTH: "ез.", NORM: "езеро"}, {ORTH: "ж.р.", NORM: "женски род"}, - {ORTH: "инж.", NORM: "инженер"}, + {ORTH: "жп.", NORM: "железопът"}, + {ORTH: "застр.", NORM: "застрахователно дело"}, + {ORTH: "знач.", NORM: "значение"}, + {ORTH: "и др.", NORM: "и други"}, + {ORTH: "и под.", NORM: "и подобни"}, + {ORTH: "и пр.", NORM: "и прочие"}, + {ORTH: "изр.", NORM: "изречение"}, + {ORTH: "изт.", NORM: "източен"}, + {ORTH: "конкр.", NORM: "конкретно"}, {ORTH: "лв.", NORM: "лев"}, + {ORTH: "л.", NORM: "лице"}, {ORTH: "м.р.", NORM: "мъжки род"}, - {ORTH: "мат.", NORM: "математика"}, - {ORTH: "мед.", NORM: "медицина"}, + {ORTH: "мин.вр.", NORM: "минало време"}, + {ORTH: "мн.ч.", NORM: "множествено число"}, + {ORTH: "напр.", NORM: "например"}, + {ORTH: "нар.", NORM: "наречие"}, + {ORTH: "науч.", NORM: "научен термин"}, + {ORTH: "непр.", NORM: "неправилно"}, + {ORTH: "обик.", NORM: "обикновено"}, + {ORTH: "опред.", NORM: "определение"}, + {ORTH: "особ.", NORM: "особено"}, + {ORTH: "ост.", NORM: "остаряло"}, + {ORTH: "относ.", NORM: "относително"}, + {ORTH: "отр.", NORM: "отрицателно"}, {ORTH: "пл.", NORM: "площад"}, - {ORTH: "проф.", NORM: "професор"}, + {ORTH: "пад.", NORM: "падеж"}, + {ORTH: "парл.", NORM: "парламентарен"}, + {ORTH: "погов.", NORM: "поговорка"}, + {ORTH: "пон.", NORM: "понякога"}, + {ORTH: "правосл.", NORM: "православен"}, + {ORTH: "прибл.", NORM: "приблизително"}, + {ORTH: "прил.", NORM: "прилагателно име"}, + {ORTH: "пр.", NORM: "прочие"}, {ORTH: "с.", NORM: "село"}, {ORTH: "с.р.", NORM: "среден род"}, - {ORTH: "св.", NORM: "свети"}, {ORTH: "сп.", NORM: "списание"}, {ORTH: "стр.", NORM: "страница"}, + {ORTH: "сз.", NORM: "съюз"}, + {ORTH: "сег.", NORM: "сегашно"}, + {ORTH: "сп.", NORM: "спорт"}, + {ORTH: "срв.", NORM: "сравни"}, + {ORTH: "с.ст.", NORM: "селскостопанска техника"}, + {ORTH: "счет.", NORM: "счетоводство"}, + {ORTH: "съкр.", NORM: "съкратено"}, + {ORTH: "съобщ.", NORM: "съобщение"}, + {ORTH: "същ.", NORM: "съществително"}, + {ORTH: "текст.", NORM: "текстилен"}, + {ORTH: "телев.", NORM: "телевизия"}, + {ORTH: "тел.", NORM: "телефон"}, + {ORTH: "т.е.", NORM: "тоест"}, + {ORTH: "т.н.", NORM: "така нататък"}, + {ORTH: "т.нар.", NORM: "така наречен"}, + {ORTH: "търж.", NORM: "тържествено"}, {ORTH: "ул.", 
NORM: "улица"}, + {ORTH: "уч.", NORM: "училище"}, + {ORTH: "унив.", NORM: "университет"}, + {ORTH: "харт.", NORM: "хартия"}, + {ORTH: "хидр.", NORM: "хидравлика"}, + {ORTH: "хран.", NORM: "хранителна"}, + {ORTH: "църк.", NORM: "църковен термин"}, + {ORTH: "числ.", NORM: "числително"}, {ORTH: "чл.", NORM: "член"}, -] - -for abbr in _abbr_dot_exc: + {ORTH: "ч.", NORM: "число"}, + {ORTH: "числ.", NORM: "числително"}, + {ORTH: "шахм.", NORM: "шахмат"}, + {ORTH: "шах.", NORM: "шахмат"}, + {ORTH: "юр.", NORM: "юридически"}, +]: _exc[abbr[ORTH]] = [abbr] +# slash abbreviations +for abbr in [ + {ORTH: "м/у", NORM: "между"}, + {ORTH: "с/у", NORM: "срещу"}, +]: + _exc[abbr[ORTH]] = [abbr] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 4eb9735df..6d0331e00 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES @@ -23,13 +23,25 @@ class Bengali(Language): @Bengali.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Bengali"] diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py old mode 100644 new mode 100755 index 250ae9463..a3def660d --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,9 +1,9 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS @@ -15,6 +15,7 @@ class CatalanDefaults(BaseDefaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES suffixes = TOKENIZER_SUFFIXES + prefixes = TOKENIZER_PREFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS syntax_iterators = SYNTAX_ITERATORS @@ -28,13 +29,25 @@ class Catalan(Language): @Catalan.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return CatalanLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer 
+ ) __all__ = ["Catalan"] diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py old mode 100644 new mode 100755 index 39db08f17..8e2f09828 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,4 +1,5 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import LIST_CURRENCY from ..char_classes import CURRENCY from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT from ..char_classes import merge_chars, _units @@ -6,6 +7,14 @@ from ..char_classes import merge_chars, _units ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") +_prefixes = ( + ["§", "%", "=", "—", "–", "-", r"\+(?![0-9])"] + + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) _infixes = ( LIST_ELLIPSES @@ -18,6 +27,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION), + r"('ls|'l|'ns|'t|'m|'n|-les|-la|-lo|-li|-los|-me|-nos|-te|-vos|-se|-hi|-ne|-ho)(?![A-Za-z])|(-l'|-m'|-t'|-n')", ] ) @@ -44,3 +54,4 @@ _suffixes = ( TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_PREFIXES = _prefixes diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py old mode 100644 new mode 100755 index 5f9a50f5e..b261b3498 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -18,12 +18,21 @@ for exc_data in [ {ORTH: "nov.", NORM: "novembre"}, {ORTH: "dec.", NORM: "desembre"}, {ORTH: "Dr.", NORM: "doctor"}, + {ORTH: "Dra.", NORM: "doctora"}, {ORTH: "Sr.", NORM: "senyor"}, {ORTH: "Sra.", NORM: "senyora"}, {ORTH: "Srta.", NORM: "senyoreta"}, {ORTH: "núm", NORM: "número"}, {ORTH: "St.", NORM: "sant"}, {ORTH: "Sta.", NORM: "santa"}, + {ORTH: "pl.", NORM: "plaça"}, + {ORTH: "à."}, + {ORTH: "è."}, + {ORTH: "é."}, + {ORTH: "í."}, + {ORTH: "ò."}, + {ORTH: "ó."}, + {ORTH: "ú."}, {ORTH: "'l"}, {ORTH: "'ls"}, {ORTH: "'m"}, @@ -34,6 +43,18 @@ for exc_data in [ ]: _exc[exc_data[ORTH]] = [exc_data] +_exc["del"] = [{ORTH: "d", NORM: "de"}, {ORTH: "el"}] +_exc["dels"] = [{ORTH: "d", NORM: "de"}, {ORTH: "els"}] + +_exc["al"] = [{ORTH: "a"}, {ORTH: "l", NORM: "el"}] +_exc["als"] = [{ORTH: "a"}, {ORTH: "ls", NORM: "els"}] + +_exc["pel"] = [{ORTH: "p", NORM: "per"}, {ORTH: "el"}] +_exc["pels"] = [{ORTH: "p", NORM: "per"}, {ORTH: "els"}] + +_exc["holahola"] = [{ORTH: "holahola", NORM: "cocacola"}] + + # Times _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", NORM: "p.m."}] diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 258b37a8a..53dd9be8e 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -28,13 +28,25 @@ class Greek(Language): @Greek.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return GreekLemmatizer(nlp.vocab, model, name, mode=mode, 
overwrite=overwrite) + return GreekLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Greek"] diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 854f59224..876186979 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -26,13 +26,25 @@ class English(Language): @English.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return EnglishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["English"] diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index 2cb0f9a53..c88b69bcc 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -10,7 +10,7 @@ class EnglishLemmatizer(Lemmatizer): Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. + univ_pos (str / int): The token's universal part-of-speech tag. morphology (dict): The token's morphological features following the Universal Dependencies scheme. 
""" diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index f5d1eb97a..e75955202 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS @@ -26,13 +26,25 @@ class Spanish(Language): @Spanish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return SpanishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Spanish"] diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 6db64ff62..914e4c27d 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -26,13 +26,25 @@ class Persian(Language): @Persian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Persian"] diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index e7267dc61..27d2a915e 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -31,13 +31,25 @@ class French(Language): @French.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return FrenchLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["French"] diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 90735d749..3be53bc7a 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,6 +1,11 @@ +from typing import Optional + +from thinc.api 
import Model + from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from ...language import Language, BaseDefaults +from .lemmatizer import IrishLemmatizer class IrishDefaults(BaseDefaults): @@ -13,4 +18,16 @@ class Irish(Language): Defaults = IrishDefaults +@Irish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return IrishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Irish"] diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py deleted file mode 100644 index d606da975..000000000 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ /dev/null @@ -1,35 +0,0 @@ -# fmt: off -consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] -broad_vowels = ["a", "á", "o", "ó", "u", "ú"] -slender_vowels = ["e", "é", "i", "í"] -vowels = broad_vowels + slender_vowels -# fmt: on - - -def ends_dentals(word): - if word != "" and word[-1] in ["d", "n", "t", "s"]: - return True - else: - return False - - -def devoice(word): - if len(word) > 2 and word[-2] == "s" and word[-1] == "d": - return word[:-1] + "t" - else: - return word - - -def ends_with_vowel(word): - return word != "" and word[-1] in vowels - - -def starts_with_vowel(word): - return word != "" and word[0] in vowels - - -def deduplicate(word): - if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: - return word[:-1] - else: - return word diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py new file mode 100644 index 000000000..47aec8fd4 --- /dev/null +++ b/spacy/lang/ga/lemmatizer.py @@ -0,0 +1,162 @@ +from typing import List, Dict, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class IrishLemmatizer(Lemmatizer): + # This is a lookup-based lemmatiser using data extracted from + # BuNaMo (https://github.com/michmech/BuNaMo) + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "pos_lookup": + # fmt: off + required = [ + "lemma_lookup_adj", "lemma_lookup_adp", + "lemma_lookup_noun", "lemma_lookup_verb" + ] + # fmt: on + return (required, []) + else: + return super().get_lookups_config(mode) + + def pos_lookup_lemmatize(self, token: Token) -> List[str]: + univ_pos = token.pos_ + string = unponc(token.text) + if univ_pos not in ["PROPN", "ADP", "ADJ", "NOUN", "VERB"]: + return [string.lower()] + demutated = demutate(string) + secondary = "" + if string[0:1].lower() == "h" and string[1:2].lower() in "aáeéiíoóuú": + secondary = string[1:] + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + if token.has_morph(): + # TODO: lookup is actually required for the genitive forms, but + # this is not in BuNaMo, and would not be of use with IDT. 
+ if univ_pos == "NOUN" and ( + "VerbForm=Vnoun" in token.morph or "VerbForm=Inf" in token.morph + ): + hpref = "Form=HPref" in token.morph + return [demutate(string, hpref).lower()] + elif univ_pos == "ADJ" and "VerbForm=Part" in token.morph: + return [demutate(string).lower()] + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + + def to_list(value): + if value is None: + value = [] + elif not isinstance(value, list): + value = [value] + return value + + if univ_pos == "ADP": + return to_list(lookup_table.get(string, string.lower())) + ret = [] + if univ_pos == "PROPN": + ret.extend(to_list(lookup_table.get(demutated))) + ret.extend(to_list(lookup_table.get(secondary))) + else: + ret.extend(to_list(lookup_table.get(demutated.lower()))) + ret.extend(to_list(lookup_table.get(secondary.lower()))) + if len(ret) == 0: + ret = [string.lower()] + return ret + + +def demutate(word: str, is_hpref: bool = False) -> str: + UVOWELS = "AÁEÉIÍOÓUÚ" + LVOWELS = "aáeéiíoóuú" + lc = word.lower() + # remove eclipsis + if lc.startswith("bhf"): + word = word[2:] + elif lc.startswith("mb"): + word = word[1:] + elif lc.startswith("gc"): + word = word[1:] + elif lc.startswith("nd"): + word = word[1:] + elif lc.startswith("ng"): + word = word[1:] + elif lc.startswith("bp"): + word = word[1:] + elif lc.startswith("dt"): + word = word[1:] + elif word[0:1] == "n" and word[1:2] in UVOWELS: + word = word[1:] + elif lc.startswith("n-") and word[2:3] in LVOWELS: + word = word[2:] + # non-standard eclipsis + elif lc.startswith("bh-f"): + word = word[3:] + elif lc.startswith("m-b"): + word = word[2:] + elif lc.startswith("g-c"): + word = word[2:] + elif lc.startswith("n-d"): + word = word[2:] + elif lc.startswith("n-g"): + word = word[2:] + elif lc.startswith("b-p"): + word = word[2:] + elif lc.startswith("d-t"): + word = word[2:] + + # t-prothesis + elif lc.startswith("ts"): + word = word[1:] + elif lc.startswith("t-s"): + word = word[2:] + + # h-prothesis, if known to be present + elif is_hpref and word[0:1] == "h": + word = word[1:] + # h-prothesis, simple case + # words can also begin with 'h', but unlike eclipsis, + # a hyphen is not used, so that needs to be handled + # elsewhere + elif word[0:1] == "h" and word[1:2] in UVOWELS: + word = word[1:] + + # lenition + # this breaks the previous if, to handle super-non-standard + # text where both eclipsis and lenition were used. 
+ if lc[0:1] in "bcdfgmpst" and lc[1:2] == "h": + word = word[0:1] + word[2:] + + return word + + +def unponc(word: str) -> str: + # fmt: off + PONC = { + "ḃ": "bh", + "ċ": "ch", + "ḋ": "dh", + "ḟ": "fh", + "ġ": "gh", + "ṁ": "mh", + "ṗ": "ph", + "ṡ": "sh", + "ṫ": "th", + "Ḃ": "BH", + "Ċ": "CH", + "Ḋ": "DH", + "Ḟ": "FH", + "Ġ": "GH", + "Ṁ": "MH", + "Ṗ": "PH", + "Ṡ": "SH", + "Ṫ": "TH" + } + # fmt: on + buf = [] + for ch in word: + if ch in PONC: + buf.append(PONC[ch]) + else: + buf.append(ch) + return "".join(buf) diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index abf49c511..63af65fe9 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -9,6 +9,8 @@ _exc = { "ded'": [{ORTH: "de", NORM: "de"}, {ORTH: "d'", NORM: "do"}], "lem'": [{ORTH: "le", NORM: "le"}, {ORTH: "m'", NORM: "mo"}], "led'": [{ORTH: "le", NORM: "le"}, {ORTH: "d'", NORM: "do"}], + "théis": [{ORTH: "th", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}], + "tréis": [{ORTH: "tr", NORM: "tar"}, {ORTH: "éis", NORM: "éis"}], } for exc_data in [ diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index 4a64a1d2c..ffaa74f50 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( ) +for u in "cfkCFK": + _exc[f"°{u}"] = [{ORTH: f"°{u}"}] + _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}] + + TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 863ed8e2f..1edebc837 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS @@ -23,13 +23,25 @@ class Italian(Language): @Italian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "pos_lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return ItalianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return ItalianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Italian"] diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 8499fc73e..81ff5b5b8 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,21 +1,25 @@ -from typing import Optional, Union, Dict, Any +from typing import Optional, Union, Dict, Any, Callable from pathlib import Path import srsly from collections import namedtuple +from thinc.api import Model +import re from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from .tag_map import TAG_MAP from .tag_orth_map import TAG_ORTH_MAP from .tag_bigram_map import TAG_BIGRAM_MAP -from ...compat import copy_reg from ...errors import Errors from ...language import Language, BaseDefaults +from ...pipeline import Morphologizer +from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL from ...scorer import Scorer from 
...symbols import POS -from ...tokens import Doc +from ...tokens import Doc, MorphAnalysis from ...training import validate_examples from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab from ... import util @@ -31,16 +35,21 @@ split_mode = null @registry.tokenizers("spacy.ja.JapaneseTokenizer") def create_tokenizer(split_mode: Optional[str] = None): def japanese_tokenizer_factory(nlp): - return JapaneseTokenizer(nlp, split_mode=split_mode) + return JapaneseTokenizer(nlp.vocab, split_mode=split_mode) return japanese_tokenizer_factory class JapaneseTokenizer(DummyTokenizer): - def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None: + self.vocab = vocab self.split_mode = split_mode self.tokenizer = try_sudachi_import(self.split_mode) + # if we're using split mode A we don't need subtokens + self.need_subtokens = not (split_mode is None or split_mode == "A") + + def __reduce__(self): + return JapaneseTokenizer, (self.vocab, self.split_mode) def __call__(self, text: str) -> Doc: # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces @@ -49,8 +58,8 @@ class JapaneseTokenizer(DummyTokenizer): dtokens, spaces = get_dtokens_and_spaces(dtokens, text) # create Doc with tag bi-gram based part-of-speech identification rules - words, tags, inflections, lemmas, readings, sub_tokens_list = ( - zip(*dtokens) if dtokens else [[]] * 6 + words, tags, inflections, lemmas, norms, readings, sub_tokens_list = ( + zip(*dtokens) if dtokens else [[]] * 7 ) sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) @@ -68,9 +77,18 @@ class JapaneseTokenizer(DummyTokenizer): ) # if there's no lemma info (it's an unk) just use the surface token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface - doc.user_data["inflections"] = inflections - doc.user_data["reading_forms"] = readings - doc.user_data["sub_tokens"] = sub_tokens_list + morph = {} + if dtoken.inf: + # it's normal for this to be empty for non-inflecting types + morph["Inflection"] = dtoken.inf + token.norm_ = dtoken.norm + if dtoken.reading: + # punctuation is its own reading, but we don't want values like + # "=" here + morph["Reading"] = re.sub("[=|]", "_", dtoken.reading) + token.morph = MorphAnalysis(self.vocab, morph) + if self.need_subtokens: + doc.user_data["sub_tokens"] = sub_tokens_list return doc def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True): @@ -81,9 +99,10 @@ class JapaneseTokenizer(DummyTokenizer): DetailedToken( token.surface(), # orth "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag - ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf + ";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf token.dictionary_form(), # lemma - token.reading_form(), # user_data['reading_forms'] + token.normalized_form(), + token.reading_form(), sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens'] @@ -105,9 +124,8 @@ class JapaneseTokenizer(DummyTokenizer): ] def _get_sub_tokens(self, sudachipy_tokens): - if ( - self.split_mode is None or self.split_mode == "A" - ): # do nothing for default split mode + # do nothing for default split mode + if not self.need_subtokens: return None sub_tokens_list = [] # list of (list of list of DetailedToken | None) @@ -176,9 +194,33 @@ class Japanese(Language): Defaults = JapaneseDefaults 
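A short sketch of what the reworked Japanese tokenizer exposes, assuming SudachiPy and a Sudachi dictionary (e.g. sudachidict_core) are installed: readings and inflections move from doc.user_data onto token.morph, the normalized form lands on token.norm_, and the new __reduce__ makes the pipeline picklable.

    import pickle
    import spacy

    nlp = spacy.blank("ja")
    doc = nlp("日本語を勉強しています")
    for token in doc:
        # Reading/Inflection are MorphAnalysis features now, not user_data entries
        print(token.text, token.norm_,
              token.morph.get("Reading"), token.morph.get("Inflection"))

    # The tokenizer defines __reduce__, so the whole pipeline round-trips through pickle
    nlp2 = pickle.loads(pickle.dumps(nlp))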
+@Japanese.factory( + "morphologizer", + assigns=["token.morph", "token.pos"], + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": True, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + }, + default_score_weights={"pos_acc": 0.5, "morph_micro_f": 0.5, "morph_per_feat": None}, +) +def make_morphologizer( + nlp: Language, + model: Model, + name: str, + overwrite: bool, + extend: bool, + scorer: Optional[Callable], +): + return Morphologizer( + nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer + ) + + # Hold the attributes we need with convenient names DetailedToken = namedtuple( - "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"] + "DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"] ) @@ -254,7 +296,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): return text_dtokens, text_spaces elif len([word for word in words if not word.isspace()]) == 0: assert text.isspace() - text_dtokens = [DetailedToken(text, gap_tag, "", text, None, None)] + text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)] text_spaces = [False] return text_dtokens, text_spaces @@ -271,7 +313,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): # space token if word_start > 0: w = text[text_pos : text_pos + word_start] - text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) + text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None)) text_spaces.append(False) text_pos += word_start @@ -287,16 +329,10 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): # trailing space token if text_pos < len(text): w = text[text_pos:] - text_dtokens.append(DetailedToken(w, gap_tag, "", w, None, None)) + text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None)) text_spaces.append(False) return text_dtokens, text_spaces -def pickle_japanese(instance): - return Japanese, tuple() - - -copy_reg.pickle(Japanese, pickle_japanese) - __all__ = ["Japanese"] diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index dfb311136..05fc67e79 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -5,11 +5,11 @@ from .tag_map import TAG_MAP from .lex_attrs import LEX_ATTRS from ...language import Language, BaseDefaults from ...tokens import Doc -from ...compat import copy_reg from ...scorer import Scorer from ...symbols import POS from ...training import validate_examples from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab DEFAULT_CONFIG = """ @@ -23,17 +23,20 @@ DEFAULT_CONFIG = """ @registry.tokenizers("spacy.ko.KoreanTokenizer") def create_tokenizer(): def korean_tokenizer_factory(nlp): - return KoreanTokenizer(nlp) + return KoreanTokenizer(nlp.vocab) return korean_tokenizer_factory class KoreanTokenizer(DummyTokenizer): - def __init__(self, nlp: Language): - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab): + self.vocab = vocab MeCab = try_mecab_import() # type: ignore[func-returns-value] self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + def __reduce__(self): + return KoreanTokenizer, (self.vocab,) + def __del__(self): self.mecab_tokenizer.__del__() @@ -106,10 +109,4 @@ def check_spaces(text, tokens): yield False -def pickle_korean(instance): - return Korean, tuple() - - -copy_reg.pickle(Korean, pickle_korean) - __all__ = ["Korean"] diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 12016c273..6ed981a06 100644 --- a/spacy/lang/lex_attrs.py +++ 
b/spacy/lang/lex_attrs.py @@ -3,6 +3,7 @@ import unicodedata import re from .. import attrs +from .tokenizer_exceptions import URL_MATCH _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match @@ -109,6 +110,8 @@ def like_url(text: str) -> bool: return True if tld.isalpha() and tld in _tlds: return True + if URL_MATCH(text): + return True return False diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py index 376afb552..fa07cfef9 100644 --- a/spacy/lang/mk/__init__.py +++ b/spacy/lang/mk/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .lemmatizer import MacedonianLemmatizer from .stop_words import STOP_WORDS @@ -38,13 +38,25 @@ class Macedonian(Language): @Macedonian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return MacedonianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return MacedonianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Macedonian"] diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index e27754e55..e079236fd 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -26,13 +26,25 @@ class Norwegian(Language): @Norwegian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Norwegian"] diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 8f370eaaf..ad2205a0b 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -30,13 +30,25 @@ class Dutch(Language): @Dutch.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): 
- return DutchLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return DutchLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Dutch"] diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 4b8c88bd7..02c96799b 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -33,13 +33,25 @@ class Polish(Language): @Polish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pos_lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "pos_lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return PolishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return PolishLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Polish"] diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 16ae5eef5..5d31d8ea2 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .stop_words import STOP_WORDS @@ -22,7 +22,12 @@ class Russian(Language): @Russian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, + default_config={ + "model": None, + "mode": "pymorphy2", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( @@ -31,8 +36,11 @@ def make_lemmatizer( name: str, mode: str, overwrite: bool, + scorer: Optional[Callable], ): - return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return RussianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index ce5ccf36f..2fc3a471b 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional, List, Dict, Tuple +from typing import Optional, List, Dict, Tuple, Callable from thinc.api import Model from ...pipeline import Lemmatizer +from ...pipeline.lemmatizer import lemmatizer_score from ...symbols import POS from ...tokens import Token from ...vocab import Vocab @@ -20,6 +21,7 @@ class RussianLemmatizer(Lemmatizer): *, mode: str = "pymorphy2", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: if mode == "pymorphy2": try: @@ -31,7 +33,7 @@ class RussianLemmatizer(Lemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer() - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) def pymorphy2_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index bde662bf7..7d29bc1b4 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,47 +1,195 @@ STOP_WORDS = 
set( """ -අතර -එච්චර -එපමණ -එලෙස -එවිට -ඒ -කට -කදී -කින් -ක් -ට -තුර -ත් -ද -නමුත් -නොහොත් -පමණ -පමණි -ම -මෙච්චර -මෙපමණ -මෙලෙස -මෙවිට -මේ -ය -යි -ලදී +සහ +සමග +සමඟ +අහා +ආහ් +ආ +ඕහෝ +අනේ +අඳෝ +අපොයි +අපෝ +අයියෝ +ආයි +ඌයි +චී +චිහ් +චික් +හෝ‍ +දෝ +දෝහෝ +මෙන් +සේ +වැනි +බඳු +වන් +අයුරු +අයුරින් ලෙස -වගේ +වැඩි +ශ්‍රී +හා +ය +නිසා +නිසාවෙන් +බවට +බව +බවෙන් +නම් +වැඩි +සිට +දී +මහා +මහ +පමණ +පමණින් +පමන වන විට -විටෙක -විතර -විය -වුව -වුවත් -වුවද -වූ -සමඟ +විටින් +මේ +මෙලෙස +මෙයින් +ඇති +ලෙස +සිදු +වශයෙන් +යන +සඳහා +මගින් +හෝ‍ +ඉතා +ඒ +එම +ද +අතර +විසින් +සමග +පිළිබඳව +පිළිබඳ +තුළ +බව +වැනි +මහ +මෙම +මෙහි +මේ +වෙත +වෙතින් +වෙතට +වෙනුවෙන් +වෙනුවට +වෙන +ගැන +නෑ +අනුව +නව +පිළිබඳ +විශේෂ +දැනට +එහෙන් +මෙහෙන් +එහේ +මෙහේ +ම +තවත් +තව සහ -හා +දක්වා +ට +ගේ +එ +ක +ක් +බවත් +බවද +මත +ඇතුලු +ඇතුළු +මෙසේ +වඩා +වඩාත්ම +නිති +නිතිත් +නිතොර +නිතර +ඉක්බිති +දැන් +යලි +පුන +ඉතින් +සිට +සිටන් +පටන් +තෙක් +දක්වා +සා +තාක් +තුවක් +පවා +ද +හෝ‍ +වත් +විනා +හැර +මිස +මුත් +කිම +කිම් +ඇයි +මන්ද හෙවත් -හෝ +නොහොත් +පතා +පාසා +ගානෙ +තව +ඉතා +බොහෝ +වහා +සෙද +සැනින් +හනික +එම්බා +එම්බල +බොල +නම් +වනාහි +කලී +ඉඳුරා +අන්න +ඔන්න +මෙන්න +උදෙසා +පිණිස +සඳහා +අරබයා +නිසා +එනිසා +එබැවින් +බැවින් +හෙයින් +සේක් +සේක +ගැන +අනුව +පරිදි +විට +තෙක් +මෙතෙක් +මේතාක් +තුරු +තුරා +තුරාවට +තුලින් +නමුත් +එනමුත් +වස් +මෙන් +ලෙස +පරිදි +එහෙත් """.split() ) diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 518ee0db7..6963e8b79 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS @@ -29,13 +29,25 @@ class Swedish(Language): @Swedish.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "rule", "overwrite": False}, + default_config={ + "model": None, + "mode": "rule", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Swedish"] diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 10d466bd3..12b1527e0 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -3,6 +3,7 @@ from .lex_attrs import LEX_ATTRS from ...language import Language, BaseDefaults from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab DEFAULT_CONFIG = """ @@ -16,13 +17,13 @@ DEFAULT_CONFIG = """ @registry.tokenizers("spacy.th.ThaiTokenizer") def create_thai_tokenizer(): def thai_tokenizer_factory(nlp): - return ThaiTokenizer(nlp) + return ThaiTokenizer(nlp.vocab) return thai_tokenizer_factory class ThaiTokenizer(DummyTokenizer): - def __init__(self, nlp: Language) -> None: + def __init__(self, vocab: Vocab) -> None: try: from pythainlp.tokenize import word_tokenize except ImportError: @@ -31,7 +32,7 @@ class ThaiTokenizer(DummyTokenizer): "https://github.com/PyThaiNLP/pythainlp" ) from None self.word_tokenize = word_tokenize - self.vocab = nlp.vocab + self.vocab = vocab def __call__(self, text: str) -> Doc: words = 
list(self.word_tokenize(text)) diff --git a/spacy/lang/ti/lex_attrs.py b/spacy/lang/ti/lex_attrs.py index ed094de3b..b29bd8c96 100644 --- a/spacy/lang/ti/lex_attrs.py +++ b/spacy/lang/ti/lex_attrs.py @@ -2,7 +2,7 @@ from ...attrs import LIKE_NUM _num_words = [ "ዜሮ", - "ሐደ", + "ሓደ", "ክልተ", "ሰለስተ", "ኣርባዕተ", @@ -11,66 +11,37 @@ _num_words = [ "ሸውዓተ", "ሽሞንተ", "ትሽዓተ", - "ኣሰርተ", - "ኣሰርተ ሐደ", - "ኣሰርተ ክልተ", - "ኣሰርተ ሰለስተ", - "ኣሰርተ ኣርባዕተ", - "ኣሰርተ ሓሙሽተ", - "ኣሰርተ ሽድሽተ", - "ኣሰርተ ሸውዓተ", - "ኣሰርተ ሽሞንተ", - "ኣሰርተ ትሽዓተ", + "ዓሰርተ", "ዕስራ", "ሰላሳ", "ኣርብዓ", - "ሃምሳ", - "ስልሳ", + "ሓምሳ", + "ሱሳ", "ሰብዓ", "ሰማንያ", - "ተስዓ", + "ቴስዓ", "ሚእቲ", "ሺሕ", "ሚልዮን", "ቢልዮን", "ትሪልዮን", "ኳድሪልዮን", - "ገጅልዮን", - "ባዝልዮን", + "ጋዚልዮን", + "ባዚልዮን" ] +# Tigrinya ordinals above 10 are the same as _num_words but start with "መበል " _ordinal_words = [ "ቀዳማይ", "ካልኣይ", "ሳልሳይ", - "ራብኣይ", + "ራብዓይ", "ሓምሻይ", "ሻድሻይ", "ሻውዓይ", "ሻምናይ", - "ዘጠነኛ", - "አስረኛ", - "ኣሰርተ አንደኛ", - "ኣሰርተ ሁለተኛ", - "ኣሰርተ ሶስተኛ", - "ኣሰርተ አራተኛ", - "ኣሰርተ አምስተኛ", - "ኣሰርተ ስድስተኛ", - "ኣሰርተ ሰባተኛ", - "ኣሰርተ ስምንተኛ", - "ኣሰርተ ዘጠነኛ", - "ሃያኛ", - "ሰላሳኛ" "አርባኛ", - "አምሳኛ", - "ስድሳኛ", - "ሰባኛ", - "ሰማንያኛ", - "ዘጠናኛ", - "መቶኛ", - "ሺኛ", - "ሚሊዮንኛ", - "ቢሊዮንኛ", - "ትሪሊዮንኛ", + "ታሽዓይ", + "ዓስራይ" ] @@ -92,7 +63,7 @@ def like_num(text): # Check ordinal number if text_lower in _ordinal_words: return True - if text_lower.endswith("ኛ"): + if text_lower.endswith("ይ"): if text_lower[:-2].isdigit(): return True diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py index 772b009bf..aa884c2ba 100644 --- a/spacy/lang/ti/punctuation.py +++ b/spacy/lang/ti/punctuation.py @@ -1,7 +1,7 @@ from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER -_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧".strip().split() +_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() _suffixes = ( _list_punct diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py index c4f8f20fa..9bd712200 100644 --- a/spacy/lang/ti/stop_words.py +++ b/spacy/lang/ti/stop_words.py @@ -1,6 +1,27 @@ +# Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt + # Stop words STOP_WORDS = set( """ -ግን ግና ንስኻ ንስኺ ንስኻትክን ንስኻትኩም ናትካ ናትኪ ናትክን ናትኩም +'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን +ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል +ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም +ስለ ስለዚ ስለዝበላ ሽዑ ቅድሚ በለ በቲ በዚ ብምባል ብተወሳኺ ብኸመይ +ብዘይ ብዘይካ ብዙሕ ብዛዕባ ብፍላይ ተባሂሉ ነበረ ነቲ ነታ ነቶም +ነዚ ነይሩ ነገራት ነገር ናብ ናብቲ ናትኩም ናትኪ ናትካ ናትክን +ናይ ናይቲ ንሕና ንሱ ንሳ ንሳቶም ንስኺ ንስኻ ንስኻትኩም ንስኻትክን ንዓይ +ኢለ ኢሉ ኢላ ኢልካ ኢሎም ኢና ኢኻ ኢዩ ኣለኹ +ኣለዉ ኣለዎ ኣሎ ኣብ ኣብቲ ኣብታ ኣብኡ ኣብዚ ኣነ ኣዝዩ ኣይኮነን ኣይኰነን +እምበር እሞ እተን እቲ እታ እቶም እንተ እንተሎ +ኣላ እንተኾነ እንታይ እንከሎ እኳ እዋን እውን እዚ እዛ እዞም +እየ እየን እዩ እያ እዮም +ከሎ ከመይ ከም ከምቲ ከምኡ ከምዘሎ +ከምዚ ከኣ ኩሉ ካልእ ካልኦት ካብ ካብቲ ካብቶም ክሳብ ክሳዕ ክብል +ክንደይ ክንዲ ክኸውን ኮይኑ ኰይኑ ኵሉ ኸም ኸኣ ወይ +ዋላ ዘለና ዘለዉ ዘለዋ ዘለዎ ዘለዎም ዘላ ዘሎ ዘይብሉ +ዝርከብ ዝበሃል ዝበለ ዝብል ዝተባህለ ዝተኻየደ ዝተፈላለየ ዝተፈላለዩ +ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ +የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ +ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ """.split() ) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index e41db911f..d76fe4262 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -250,3 +250,9 @@ o.0 for orth in emoticons: BASE_EXCEPTIONS[orth] = [{ORTH: orth}] + + +# Moved from a suffix setting due to #9155 removing prefixes from consideration +# for lookbehinds +for u in "cfkCFK": + BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}] diff --git a/spacy/lang/uk/__init__.py 
b/spacy/lang/uk/__init__.py index 1fa568292..21f9649f2 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model @@ -23,13 +23,25 @@ class Ukrainian(Language): @Ukrainian.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "pymorphy2", "overwrite": False}, + default_config={ + "model": None, + "mode": "pymorphy2", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return UkrainianLemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 1fb030e06..fd566a3a8 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing import Optional, Callable from thinc.api import Model from ..ru.lemmatizer import RussianLemmatizer +from ...pipeline.lemmatizer import lemmatizer_score from ...vocab import Vocab @@ -15,6 +16,7 @@ class UkrainianLemmatizer(RussianLemmatizer): *, mode: str = "pymorphy2", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: if mode == "pymorphy2": try: @@ -27,4 +29,4 @@ class UkrainianLemmatizer(RussianLemmatizer): ) from None if getattr(self, "_morph", None) is None: self._morph = MorphAnalyzer(lang="uk") - super().__init__(vocab, model, name, mode=mode, overwrite=overwrite) + super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer) diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 9d5fd8d9d..822dc348c 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -9,6 +9,7 @@ from .lex_attrs import LEX_ATTRS from ...language import Language, BaseDefaults from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab from ... import util @@ -24,14 +25,14 @@ use_pyvi = true @registry.tokenizers("spacy.vi.VietnameseTokenizer") def create_vietnamese_tokenizer(use_pyvi: bool = True): def vietnamese_tokenizer_factory(nlp): - return VietnameseTokenizer(nlp, use_pyvi=use_pyvi) + return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi) return vietnamese_tokenizer_factory class VietnameseTokenizer(DummyTokenizer): - def __init__(self, nlp: Language, use_pyvi: bool = False): - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab, use_pyvi: bool = False): + self.vocab = vocab self.use_pyvi = use_pyvi if self.use_pyvi: try: @@ -45,6 +46,9 @@ class VietnameseTokenizer(DummyTokenizer): ) raise ImportError(msg) from None + def __reduce__(self): + return VietnameseTokenizer, (self.vocab, self.use_pyvi) + def __call__(self, text: str) -> Doc: if self.use_pyvi: words = self.pyvi_tokenize(text) diff --git a/spacy/lang/vi/examples.py b/spacy/lang/vi/examples.py new file mode 100644 index 000000000..86d0b50b8 --- /dev/null +++ b/spacy/lang/vi/examples.py @@ -0,0 +1,18 @@ + +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.vi.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Đây là đâu, tôi là ai?", + "Căn phòng có nhiều cửa sổ nên nó khá sáng", + "Đại dịch COVID vừa qua đã gây ảnh hưởng rất lớn tới nhiều doanh nghiệp lớn nhỏ.", + "Thành phố Hồ Chí Minh đã bị ảnh hưởng nặng nề trong thời gian vừa qua.", + "Ông bạn đang ở đâu thế?", + "Ai là người giải phóng đất nước Việt Nam khỏi ách đô hộ?", + "Vị tướng nào là người đã làm nên chiến thắng lịch sử Điện Biên Phủ?", + "Làm việc nhiều chán quá, đi chơi đâu đi?", +] diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py index b3dbf2192..33a3745cc 100644 --- a/spacy/lang/vi/lex_attrs.py +++ b/spacy/lang/vi/lex_attrs.py @@ -9,11 +9,14 @@ _num_words = [ "bốn", "năm", "sáu", + "bảy", "bẩy", "tám", "chín", "mười", + "chục", "trăm", + "nghìn", "tỷ", ] diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 755a294e2..fdf6776e2 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -11,6 +11,7 @@ from ...scorer import Scorer from ...tokens import Doc from ...training import validate_examples, Example from ...util import DummyTokenizer, registry, load_config_from_str +from ...vocab import Vocab from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS from ... import util @@ -48,14 +49,14 @@ class Segmenter(str, Enum): @registry.tokenizers("spacy.zh.ChineseTokenizer") def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): def chinese_tokenizer_factory(nlp): - return ChineseTokenizer(nlp, segmenter=segmenter) + return ChineseTokenizer(nlp.vocab, segmenter=segmenter) return chinese_tokenizer_factory class ChineseTokenizer(DummyTokenizer): - def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char): - self.vocab = nlp.vocab + def __init__(self, vocab: Vocab, segmenter: Segmenter = Segmenter.char): + self.vocab = vocab self.segmenter = ( segmenter.value if isinstance(segmenter, Segmenter) else segmenter ) diff --git a/spacy/language.py b/spacy/language.py index 80703259d..aa57989ac 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -115,7 +115,7 @@ class Language: Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. - lang (str): Two-letter language ID, i.e. ISO code. + lang (str): IETF language code, such as 'en'. DOCS: https://spacy.io/api/language """ @@ -228,6 +228,7 @@ class Language: "vectors": len(self.vocab.vectors), "keys": self.vocab.vectors.n_keys, "name": self.vocab.vectors.name, + "mode": self.vocab.vectors.mode, } self._meta["labels"] = dict(self.pipe_labels) # TODO: Adding this back to prevent breaking people's code etc., but @@ -978,7 +979,7 @@ class Language: def __call__( self, - text: str, + text: Union[str, Doc], *, disable: Iterable[str] = SimpleFrozenList(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, @@ -987,7 +988,9 @@ class Language: and can contain arbitrary whitespace. Alignment into the original string is preserved. - text (str): The text to be processed. + text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`, + the doc will be passed directly to the pipeline, skipping + `Language.make_doc`. disable (List[str]): Names of the pipeline components to disable. component_cfg (Dict[str, dict]): An optional dictionary with extra keyword arguments for specific components. 
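With Language.__call__ now typed to accept Union[str, Doc] (handled by the _ensure_doc helper added below), an existing Doc is passed straight to the pipeline instead of being re-tokenized; a small sketch:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=["Hello", "world"], spaces=[True, False])
    # A pre-built Doc is returned as-is (a blank pipeline has no components)
    assert nlp(doc) is doc
    # Plain strings keep working exactly as before
    doc2 = nlp("Hello world")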
@@ -995,7 +998,7 @@ class Language: DOCS: https://spacy.io/api/language#call """ - doc = self.make_doc(text) + doc = self._ensure_doc(text) if component_cfg is None: component_cfg = {} for name, proc in self.pipeline: @@ -1080,6 +1083,20 @@ class Language: ) return self.tokenizer(text) + def _ensure_doc(self, doc_like: Union[str, Doc]) -> Doc: + """Create a Doc if need be, or raise an error if the input is not a Doc or a string.""" + if isinstance(doc_like, Doc): + return doc_like + if isinstance(doc_like, str): + return self.make_doc(doc_like) + raise ValueError(Errors.E866.format(type=type(doc_like))) + + def _ensure_doc_with_context(self, doc_like: Union[str, Doc], context: Any) -> Doc: + """Create a Doc if need be and add as_tuples context, or raise an error if the input is not a Doc or a string.""" + doc = self._ensure_doc(doc_like) + doc._context = context + return doc + def update( self, examples: Iterable[Example], @@ -1450,7 +1467,7 @@ class Language: @overload def pipe( self, - texts: Iterable[str], + texts: Iterable[Union[str, Doc]], *, as_tuples: Literal[False] = ..., batch_size: Optional[int] = ..., @@ -1463,7 +1480,7 @@ class Language: @overload def pipe( # noqa: F811 self, - texts: Iterable[Tuple[str, _AnyContext]], + texts: Iterable[Tuple[Union[str, Doc], _AnyContext]], *, as_tuples: Literal[True] = ..., batch_size: Optional[int] = ..., @@ -1475,7 +1492,9 @@ class Language: def pipe( # noqa: F811 self, - texts: Union[Iterable[str], Iterable[Tuple[str, _AnyContext]]], + texts: Union[ + Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]] + ], *, as_tuples: bool = False, batch_size: Optional[int] = None, @@ -1485,7 +1504,8 @@ class Language: ) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]: """Process texts as a stream, and yield `Doc` objects in order. - texts (Iterable[str]): A sequence of texts to process. + texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to + process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. @@ -1500,23 +1520,24 @@ class Language: """ # Handle texts with context as tuples if as_tuples: - texts = cast(Iterable[Tuple[str, _AnyContext]], texts) - text_context1, text_context2 = itertools.tee(texts) - texts = (tc[0] for tc in text_context1) - contexts = (tc[1] for tc in text_context2) + texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts) + docs_with_contexts = ( + self._ensure_doc_with_context(text, context) for text, context in texts + ) docs = self.pipe( - texts, + docs_with_contexts, batch_size=batch_size, disable=disable, n_process=n_process, component_cfg=component_cfg, ) - for doc, context in zip(docs, contexts): + for doc in docs: + context = doc._context + doc._context = None yield (doc, context) return - # At this point, we know that we're dealing with an iterable of plain texts - texts = cast(Iterable[str], texts) + texts = cast(Iterable[Union[str, Doc]], texts) # Set argument defaults if n_process == -1: @@ -1551,7 +1572,7 @@ class Language: docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size) else: # if n_process == 1, no processes are forked. 
- docs = (self.make_doc(text) for text in texts) + docs = (self._ensure_doc(text) for text in texts) for pipe in pipes: docs = pipe(docs) for doc in docs: @@ -1570,7 +1591,7 @@ class Language: def _multiprocessing_pipe( self, - texts: Iterable[str], + texts: Iterable[Union[str, Doc]], pipes: Iterable[Callable[..., Iterator[Doc]]], n_process: int, batch_size: int, @@ -1596,7 +1617,7 @@ class Language: procs = [ mp.Process( target=_apply_pipes, - args=(self.make_doc, pipes, rch, sch, Underscore.get_state()), + args=(self._ensure_doc, pipes, rch, sch, Underscore.get_state()), ) for rch, sch in zip(texts_q, bytedocs_send_ch) ] @@ -1609,11 +1630,12 @@ class Language: recv.recv() for recv in cycle(bytedocs_recv_ch) ) try: - for i, (_, (byte_doc, byte_error)) in enumerate( + for i, (_, (byte_doc, byte_context, byte_error)) in enumerate( zip(raw_texts, byte_tuples), 1 ): if byte_doc is not None: doc = Doc(self.vocab).from_bytes(byte_doc) + doc._context = byte_context yield doc elif byte_error is not None: error = srsly.msgpack_loads(byte_error) @@ -2138,7 +2160,7 @@ def _copy_examples(examples: Iterable[Example]) -> List[Example]: def _apply_pipes( - make_doc: Callable[[str], Doc], + ensure_doc: Callable[[Union[str, Doc]], Doc], pipes: Iterable[Callable[..., Iterator[Doc]]], receiver, sender, @@ -2146,7 +2168,8 @@ def _apply_pipes( ) -> None: """Worker for Language.pipe - make_doc (Callable[[str,] Doc]): Function to create Doc from text. + ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text + or raise an error if the input is neither a Doc nor a string. pipes (Iterable[Pipe]): The components to apply. receiver (multiprocessing.Connection): Pipe to receive text. Usually created by `multiprocessing.Pipe()` @@ -2159,16 +2182,16 @@ def _apply_pipes( while True: try: texts = receiver.get() - docs = (make_doc(text) for text in texts) + docs = (ensure_doc(text) for text in texts) for pipe in pipes: docs = pipe(docs) # type: ignore[arg-type, assignment] # Connection does not accept unpickable objects, so send list. 
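Because the as_tuples context now rides along on the Doc itself (doc._context), it also survives the multiprocessing path and mixed str/Doc inputs; a small sketch:

    import spacy

    nlp = spacy.blank("en")
    data = [("First text", {"id": 1}), ("Second text", {"id": 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context["id"], doc.text)
    # The same call with n_process > 1 now yields the contexts as well,
    # since they are serialized alongside each Doc by the worker processes.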
- byte_docs = [(doc.to_bytes(), None) for doc in docs] - padding = [(None, None)] * (len(texts) - len(byte_docs)) + byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs] + padding = [(None, None, None)] * (len(texts) - len(byte_docs)) sender.send(byte_docs + padding) # type: ignore[operator] except Exception: - error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] - padding = [(None, None)] * (len(texts) - 1) + error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))] + padding = [(None, None, None)] * (len(texts) - 1) sender.send(error_msg + padding) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 3564b6e42..792e405dd 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -284,7 +284,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.lower] - def __set__(self, unicode x): + def __set__(self, str x): self.c.lower = self.vocab.strings.add(x) property norm_: @@ -294,7 +294,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.norm] - def __set__(self, unicode x): + def __set__(self, str x): self.norm = self.vocab.strings.add(x) property shape_: @@ -304,7 +304,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.shape] - def __set__(self, unicode x): + def __set__(self, str x): self.c.shape = self.vocab.strings.add(x) property prefix_: @@ -314,7 +314,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.prefix] - def __set__(self, unicode x): + def __set__(self, str x): self.c.prefix = self.vocab.strings.add(x) property suffix_: @@ -324,7 +324,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.suffix] - def __set__(self, unicode x): + def __set__(self, str x): self.c.suffix = self.vocab.strings.add(x) property lang_: @@ -332,7 +332,7 @@ cdef class Lexeme: def __get__(self): return self.vocab.strings[self.c.lang] - def __set__(self, unicode x): + def __set__(self, str x): self.c.lang = self.vocab.strings.add(x) property flags: diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index b667e6b2f..a602ba737 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -148,9 +148,9 @@ cdef class DependencyMatcher: Creates a token key to be used by the matcher """ return self._normalize_key( - unicode(key) + DELIMITER + - unicode(pattern_idx) + DELIMITER + - unicode(token_idx) + str(key) + DELIMITER + + str(pattern_idx) + DELIMITER + + str(token_idx) ) def add(self, key, patterns, *, on_match=None): @@ -424,7 +424,7 @@ cdef class DependencyMatcher: return [doc[child.i] for child in doc[node].head.children if child.i < node] def _normalize_key(self, key): - if isinstance(key, basestring): + if isinstance(key, str): return self.vocab.strings.add(key) else: return key diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index f8482a1eb..745d7cf43 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -312,7 +312,7 @@ cdef class Matcher: return final_results def _normalize_key(self, key): - if isinstance(key, basestring): + if isinstance(key, str): return self.vocab.strings.add(key) else: return key @@ -360,7 +360,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e for i, token in enumerate(doclike): for name, index in extensions.items(): value = token._.get(name) - if isinstance(value, basestring): + if isinstance(value, str): value = token.vocab.strings[value] extra_attr_values[i * nr_extra_attr + index] = 
value # Main loop @@ -786,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): def _get_attr_values(spec, string_store): attr_values = [] for attr, value in spec.items(): - if isinstance(attr, basestring): + if isinstance(attr, str): attr = attr.upper() if attr == '_': continue @@ -797,7 +797,7 @@ def _get_attr_values(spec, string_store): if attr == "IS_SENT_START": attr = "SENT_START" attr = IDS.get(attr) - if isinstance(value, basestring): + if isinstance(value, str): value = string_store.add(value) elif isinstance(value, bool): value = int(value) @@ -938,7 +938,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): seen_predicates = {pred.key: pred.i for pred in extra_predicates} output = [] for attr, value in spec.items(): - if isinstance(attr, basestring): + if isinstance(attr, str): if attr == "_": output.extend( _get_extension_extra_predicates( @@ -995,7 +995,7 @@ def _get_operators(spec): "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)} # Fix casing spec = {key.upper(): values for key, values in spec.items() - if isinstance(key, basestring)} + if isinstance(key, str)} if "OP" not in spec: return (ONE,) elif spec["OP"] in lookup: @@ -1013,7 +1013,7 @@ def _get_extensions(spec, string_store, name2index): if isinstance(value, dict): # Handle predicates (e.g. "IN", in the extra_predicates, not here. continue - if isinstance(value, basestring): + if isinstance(value, str): value = string_store.add(value) if name not in name2index: name2index[name] = len(name2index) diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 53ef01906..8dd65833b 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -1,11 +1,13 @@ -from typing import List, Tuple, Callable, Optional, cast +from typing import List, Tuple, Callable, Optional, Sequence, cast from thinc.initializers import glorot_uniform_init from thinc.util import partial -from thinc.types import Ragged, Floats2d, Floats1d +from thinc.types import Ragged, Floats2d, Floats1d, Ints1d from thinc.api import Model, Ops, registry from ..tokens import Doc from ..errors import Errors +from ..vectors import Mode +from ..vocab import Vocab @registry.layers("spacy.StaticVectors.v2") @@ -34,20 +36,32 @@ def StaticVectors( def forward( model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool ) -> Tuple[Ragged, Callable]: - if not sum(len(doc) for doc in docs): + token_count = sum(len(doc) for doc in docs) + if not token_count: return _handle_empty(model.ops, model.get_dim("nO")) - key_attr = model.attrs["key_attr"] - W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) - V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data)) - rows = model.ops.flatten( - [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] + key_attr: int = model.attrs["key_attr"] + keys: Ints1d = model.ops.flatten( + cast(Sequence, [doc.to_array(key_attr) for doc in docs]) ) + vocab: Vocab = docs[0].vocab + W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) + if vocab.vectors.mode == Mode.default: + V = cast(Floats2d, model.ops.asarray(vocab.vectors.data)) + rows = vocab.vectors.find(keys=keys) + V = model.ops.as_contig(V[rows]) + elif vocab.vectors.mode == Mode.floret: + V = cast(Floats2d, vocab.vectors.get_batch(keys)) + V = model.ops.as_contig(V) + else: + raise RuntimeError(Errors.E896) try: - vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True) + vectors_data = model.ops.gemm(V, W, trans2=True) except ValueError: raise 
RuntimeError(Errors.E896) - # Convert negative indices to 0-vectors (TODO: more options for UNK tokens) - vectors_data[rows < 0] = 0 + if vocab.vectors.mode == Mode.default: + # Convert negative indices to 0-vectors + # TODO: more options for UNK tokens + vectors_data[rows < 0] = 0 output = Ragged( vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i") # type: ignore ) @@ -63,7 +77,7 @@ def forward( model.inc_grad( "W", model.ops.gemm( - cast(Floats2d, d_output.data), model.ops.as_contig(V[rows]), trans1=True + cast(Floats2d, d_output.data), model.ops.as_contig(V), trans1=True ), ) return [] diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 9ca702f9b..f34975858 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -17,7 +17,7 @@ from ...errors import Errors from thinc.extra.search cimport Beam cdef weight_t MIN_SCORE = -90000 -cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') +cdef attr_t SUBTOK_LABEL = hash_string('subtok') DEF NON_MONOTONIC = True diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 331eaa4d8..0d9494865 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -5,15 +5,15 @@ from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..training import validate_examples, Example +from ..training import Example from ..language import Language from ..matcher import Matcher from ..scorer import Scorer -from ..symbols import IDS, TAG, POS, MORPH, LEMMA +from ..symbols import IDS from ..tokens import Doc, Span from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..vocab import Vocab -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from .. 
import util @@ -23,9 +23,41 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory("attribute_ruler", default_config={"validate": False}) -def make_attribute_ruler(nlp: Language, name: str, validate: bool): - return AttributeRuler(nlp.vocab, name, validate=validate) +@Language.factory( + "attribute_ruler", + default_config={ + "validate": False, + "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"}, + }, +) +def make_attribute_ruler( + nlp: Language, name: str, validate: bool, scorer: Optional[Callable] +): + return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) + + +def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + def morph_key_getter(token, attr): + return getattr(token, attr).key + + results = {} + results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update( + Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs) + ) + results.update( + Scorer.score_token_attr_per_feat( + examples, "morph", getter=morph_key_getter, **kwargs + ) + ) + results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) + return results + + +@registry.scorers("spacy.attribute_ruler_scorer.v1") +def make_attribute_ruler_scorer(): + return attribute_ruler_score class AttributeRuler(Pipe): @@ -36,7 +68,12 @@ class AttributeRuler(Pipe): """ def __init__( - self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False + self, + vocab: Vocab, + name: str = "attribute_ruler", + *, + validate: bool = False, + scorer: Optional[Callable] = attribute_ruler_score, ) -> None: """Create the AttributeRuler. After creation, you can add patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns @@ -45,6 +82,10 @@ class AttributeRuler(Pipe): vocab (Vocab): The vocab. name (str): The pipe name. Defaults to "attribute_ruler". + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attributes "tag", "pos", "morph" and + "lemma" and Scorer.score_token_attr_per_feat for the attribute + "morph". RETURNS (AttributeRuler): The AttributeRuler component. @@ -57,6 +98,7 @@ class AttributeRuler(Pipe): self.attrs: List[Dict] = [] self._attrs_unnormed: List[Dict] = [] # store for reference self.indices: List[int] = [] + self.scorer = scorer def clear(self) -> None: """Reset all patterns.""" @@ -228,45 +270,6 @@ class AttributeRuler(Pipe): all_patterns.append(p) return all_patterns # type: ignore[return-value] - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag", "pos", "morph" - and "lemma" for the target token attributes. 
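The pattern above, where a component's default scorer is resolved from the new scorers registry, means scoring can be swapped out from config. A sketch with a made-up scorer name (my_lemma_only_scorer.v1 is purely illustrative):

    import spacy
    from spacy.scorer import Scorer
    from spacy.util import registry

    @registry.scorers("my_lemma_only_scorer.v1")
    def make_my_lemma_only_scorer():
        def score(examples, **kwargs):
            # Only report lemma accuracy, ignoring tag/pos/morph
            return Scorer.score_token_attr(examples, "lemma", **kwargs)
        return score

    nlp = spacy.blank("en")
    nlp.add_pipe(
        "attribute_ruler",
        config={"scorer": {"@scorers": "my_lemma_only_scorer.v1"}},
    )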
- - DOCS: https://spacy.io/api/tagger#score - """ - - def morph_key_getter(token, attr): - return getattr(token, attr).key - - validate_examples(examples, "AttributeRuler.score") - results = {} - attrs = set() # type: ignore - for token_attrs in self.attrs: - attrs.update(token_attrs) - for attr in attrs: - if attr == TAG: - results.update(Scorer.score_token_attr(examples, "tag", **kwargs)) - elif attr == POS: - results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - elif attr == MORPH: - results.update( - Scorer.score_token_attr( - examples, "morph", getter=morph_key_getter, **kwargs - ) - ) - results.update( - Scorer.score_token_attr_per_feat( - examples, "morph", getter=morph_key_getter, **kwargs - ) - ) - elif attr == LEMMA: - results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) - return results - def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: """Serialize the AttributeRuler to a bytestring. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index be23ab0dd..50c57ee5b 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from collections import defaultdict -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem @@ -12,7 +12,7 @@ from ..language import Language from ._parser_internals import nonproj from ._parser_internals.nonproj import DELIMITER from ..scorer import Scorer -from ..training import validate_examples +from ..util import registry default_model_config = """ @@ -46,6 +46,7 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, @@ -63,7 +64,8 @@ def make_parser( moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, - min_action_freq: int + min_action_freq: int, + scorer: Optional[Callable], ): """Create a transition-based DependencyParser component. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can @@ -100,6 +102,7 @@ def make_parser( primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. + scorer (Optional[Callable]): The scoring method. """ return DependencyParser( nlp.vocab, @@ -115,7 +118,8 @@ def make_parser( beam_update_prob=0.0, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None + incorrect_spans_key=None, + scorer=scorer, ) @Language.factory( @@ -130,6 +134,7 @@ def make_parser( "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, + "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, @@ -151,6 +156,7 @@ def make_beam_parser( beam_width: int, beam_density: float, beam_update_prob: float, + scorer: Optional[Callable], ): """Create a transition-based DependencyParser component that uses beam-search. 
The dependency parser jointly learns sentence segmentation and labelled @@ -207,10 +213,41 @@ def make_beam_parser( min_action_freq=min_action_freq, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. - incorrect_spans_key=None + incorrect_spans_key=None, + scorer=scorer, ) +def parser_score(examples, **kwargs): + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans + and Scorer.score_deps. + + DOCS: https://spacy.io/api/dependencyparser#score + """ + def has_sents(doc): + return doc.has_annotation("SENT_START") + + def dep_getter(token, attr): + dep = getattr(token, attr) + dep = token.vocab.strings.as_string(dep).lower() + return dep + results = {} + results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) + kwargs.setdefault("getter", dep_getter) + kwargs.setdefault("ignore_labels", ("p", "punct")) + results.update(Scorer.score_deps(examples, "dep", **kwargs)) + del results["sents_per_type"] + return results + + +@registry.scorers("spacy.parser_scorer.v1") +def make_parser_scorer(): + return parser_score + + cdef class DependencyParser(Parser): """Pipeline component for dependency parsing. @@ -233,6 +270,7 @@ cdef class DependencyParser(Parser): beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, + scorer=parser_score, ): """Create a DependencyParser. """ @@ -249,6 +287,7 @@ cdef class DependencyParser(Parser): beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) @property @@ -281,31 +320,6 @@ cdef class DependencyParser(Parser): labels.add(label) return tuple(sorted(labels)) - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans - and Scorer.score_deps. - - DOCS: https://spacy.io/api/dependencyparser#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "DependencyParser.score") - def dep_getter(token, attr): - dep = getattr(token, attr) - dep = token.vocab.strings.as_string(dep).lower() - return dep - results = {} - results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) - kwargs.setdefault("getter", dep_getter) - kwargs.setdefault("ignore_labels", ("p", "punct")) - results.update(Scorer.score_deps(examples, "dep", **kwargs)) - del results["sents_per_type"] - return results - def scored_parses(self, beams): """Return two dictionaries with scores for each beam/doc that was processed: one containing (i, head) keys, and another containing (i, label) keys. diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 4a0902444..1169e898d 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -17,10 +17,12 @@ from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors, Warnings -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from .. 
import util from ..scorer import Scorer +# See #9050 +BACKWARD_OVERWRITE = True default_model_config = """ [model] @@ -51,6 +53,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "incl_context": True, "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, + "overwrite": True, + "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, }, default_score_weights={ "nel_micro_f": 1.0, @@ -69,6 +73,8 @@ def make_entity_linker( incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + overwrite: bool, + scorer: Optional[Callable], ): """Construct an EntityLinker component. @@ -82,6 +88,7 @@ def make_entity_linker( entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. """ return EntityLinker( nlp.vocab, @@ -93,9 +100,20 @@ def make_entity_linker( incl_context=incl_context, entity_vector_length=entity_vector_length, get_candidates=get_candidates, + overwrite=overwrite, + scorer=scorer, ) +def entity_linker_score(examples, **kwargs): + return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs) + + +@registry.scorers("spacy.entity_linker_scorer.v1") +def make_entity_linker_scorer(): + return entity_linker_score + + class EntityLinker(TrainablePipe): """Pipeline component for named entity linking. @@ -116,6 +134,8 @@ class EntityLinker(TrainablePipe): incl_context: bool, entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], + overwrite: bool = BACKWARD_OVERWRITE, + scorer: Optional[Callable] = entity_linker_score, ) -> None: """Initialize an entity linker. @@ -130,6 +150,8 @@ class EntityLinker(TrainablePipe): entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_links. DOCS: https://spacy.io/api/entitylinker#init """ @@ -141,11 +163,12 @@ class EntityLinker(TrainablePipe): self.incl_prior = incl_prior self.incl_context = incl_context self.get_candidates = get_candidates - self.cfg: Dict[str, Any] = {} + self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) # how many neighbour sentences to take into account # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. self.kb = empty_kb(entity_vector_length)(self.vocab) + self.scorer = scorer def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -384,23 +407,14 @@ class EntityLinker(TrainablePipe): if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i = 0 + overwrite = self.cfg["overwrite"] for doc in docs: for ent in doc.ents: kb_id = kb_ids[i] i += 1 for token in ent: - token.ent_kb_id_ = kb_id - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores. 
- - DOCS TODO: https://spacy.io/api/entity_linker#score - """ - validate_examples(examples, "EntityLinker.score") - return Scorer.score_links(examples, negative_labels=[self.NIL]) + if token.ent_kb_id == 0 or overwrite: + token.ent_kb_id_ = kb_id def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index b8f32b4d3..2c3db2575 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -9,11 +9,10 @@ from .pipe import Pipe from ..training import Example from ..language import Language from ..errors import Errors, Warnings -from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList +from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import get_ner_prf -from ..training import validate_examples DEFAULT_ENT_ID_SEP = "||" @@ -28,6 +27,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] "validate": False, "overwrite_ents": False, "ent_id_sep": DEFAULT_ENT_ID_SEP, + "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"}, }, default_score_weights={ "ents_f": 1.0, @@ -43,6 +43,7 @@ def make_entity_ruler( validate: bool, overwrite_ents: bool, ent_id_sep: str, + scorer: Optional[Callable], ): return EntityRuler( nlp, @@ -51,9 +52,19 @@ def make_entity_ruler( validate=validate, overwrite_ents=overwrite_ents, ent_id_sep=ent_id_sep, + scorer=scorer, ) +def entity_ruler_score(examples, **kwargs): + return get_ner_prf(examples) + + +@registry.scorers("spacy.entity_ruler_scorer.v1") +def make_entity_ruler_scorer(): + return entity_ruler_score + + class EntityRuler(Pipe): """The EntityRuler lets you add spans to the `Doc.ents` using token-based rules or exact phrase matches. It can be combined with the statistical @@ -75,6 +86,7 @@ class EntityRuler(Pipe): overwrite_ents: bool = False, ent_id_sep: str = DEFAULT_ENT_ID_SEP, patterns: Optional[List[PatternType]] = None, + scorer: Optional[Callable] = entity_ruler_score, ) -> None: """Initialize the entity ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -95,6 +107,8 @@ class EntityRuler(Pipe): overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. + scorer (Optional[Callable]): The scoring method. Defaults to + spacy.scorer.get_ner_prf. 
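A quick sanity check of the new module-level entity_ruler_score, which is a thin wrapper around get_ner_prf; the pattern and text below are made up for illustration.

import spacy
from spacy.pipeline.entityruler import entity_ruler_score
from spacy.training import Example

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])

doc = nlp("spaCy is a library")  # the ruler labels "spaCy" as ORG
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
print(entity_ruler_score([example])["ents_f"])  # 1.0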
DOCS: https://spacy.io/api/entityruler#init """ @@ -113,6 +127,7 @@ class EntityRuler(Pipe): self._ent_ids = defaultdict(tuple) # type: ignore if patterns is not None: self.add_patterns(patterns) + self.scorer = scorer def __len__(self) -> int: """The number of all patterns added to the entity ruler.""" @@ -363,10 +378,6 @@ class EntityRuler(Pipe): label = f"{label}{self.ent_id_sep}{ent_id}" return label - def score(self, examples, **kwargs): - validate_examples(examples, "EntityRuler.score") - return get_ner_prf(examples) - def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index ad227d240..9c2fc2f09 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -12,21 +12,41 @@ from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab -from ..training import validate_examples -from ..util import logger, SimpleFrozenList +from ..util import logger, SimpleFrozenList, registry from .. import util @Language.factory( "lemmatizer", assigns=["token.lemma"], - default_config={"model": None, "mode": "lookup", "overwrite": False}, + default_config={ + "model": None, + "mode": "lookup", + "overwrite": False, + "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + }, default_score_weights={"lemma_acc": 1.0}, ) def make_lemmatizer( - nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + overwrite: bool, + scorer: Optional[Callable], ): - return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer + ) + + +def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_token_attr(examples, "lemma", **kwargs) + + +@registry.scorers("spacy.lemmatizer_scorer.v1") +def make_lemmatizer_scorer(): + return lemmatizer_score class Lemmatizer(Pipe): @@ -60,6 +80,7 @@ class Lemmatizer(Pipe): *, mode: str = "lookup", overwrite: bool = False, + scorer: Optional[Callable] = lemmatizer_score, ) -> None: """Initialize a Lemmatizer. @@ -69,6 +90,8 @@ class Lemmatizer(Pipe): mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". overwrite (bool): Whether to overwrite existing lemmas. Defaults to `False`. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attribute "lemma". DOCS: https://spacy.io/api/lemmatizer#init """ @@ -89,6 +112,7 @@ class Lemmatizer(Pipe): raise ValueError(Errors.E1003.format(mode=mode)) self.lemmatize = getattr(self, mode_attr) self.cache = {} # type: ignore[var-annotated] + self.scorer = scorer @property def mode(self): @@ -247,17 +271,6 @@ class Lemmatizer(Pipe): """ return False - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores. 
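The lemmatizer's registered scorer reduces to Scorer.score_token_attr over the "lemma" attribute; a small sketch of what that computes, with values chosen for illustration.

from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example

nlp = English()
doc = nlp.make_doc("cats sat")
example = Example.from_dict(doc, {"lemmas": ["cat", "sit"]})
example.predicted[0].lemma_ = "cat"  # correct
example.predicted[1].lemma_ = "sat"  # wrong, reference is "sit"
print(Scorer.score_token_attr([example], "lemma"))  # {'lemma_acc': 0.5}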
- - DOCS: https://spacy.io/api/lemmatizer#score - """ - validate_examples(examples, "Lemmatizer.score") - return Scorer.score_token_attr(examples, "lemma", **kwargs) - def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 3ba05e616..db425b69a 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,5 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, Union, Dict +from typing import Optional, Union, Dict, Callable import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config from itertools import islice @@ -17,7 +17,11 @@ from .tagger import Tagger from .. import util from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry +# See #9050 +BACKWARD_OVERWRITE = True +BACKWARD_EXTEND = False default_model_config = """ [model] @@ -48,15 +52,35 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL}, + default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( nlp: Language, model: Model, name: str, + overwrite: bool, + extend: bool, + scorer: Optional[Callable], ): - return Morphologizer(nlp.vocab, model, name) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer) + + +def morphologizer_score(examples, **kwargs): + def morph_key_getter(token, attr): + return getattr(token, attr).key + + results = {} + results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) + results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) + results.update(Scorer.score_token_attr_per_feat(examples, + "morph", getter=morph_key_getter, **kwargs)) + return results + + +@registry.scorers("spacy.morphologizer_scorer.v1") +def make_morphologizer_scorer(): + return morphologizer_score class Morphologizer(Tagger): @@ -67,6 +91,10 @@ class Morphologizer(Tagger): vocab: Vocab, model: Model, name: str = "morphologizer", + *, + overwrite: bool = BACKWARD_OVERWRITE, + extend: bool = BACKWARD_EXTEND, + scorer: Optional[Callable] = morphologizer_score, ): """Initialize a morphologizer. @@ -74,6 +102,9 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attributes "pos" and "morph" and + Scorer.score_token_attr_per_feat for the attribute "morph". 
DOCS: https://spacy.io/api/morphologizer#init """ @@ -85,8 +116,14 @@ class Morphologizer(Tagger): # store mappings from morph+POS labels to token-level annotations: # 1) labels_morph stores a mapping from morph+POS->morph # 2) labels_pos stores a mapping from morph+POS->POS - cfg = {"labels_morph": {}, "labels_pos": {}} + cfg = { + "labels_morph": {}, + "labels_pos": {}, + "overwrite": overwrite, + "extend": extend, + } self.cfg = dict(sorted(cfg.items())) + self.scorer = scorer @property def labels(self): @@ -192,14 +229,34 @@ class Morphologizer(Tagger): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab + cdef bint overwrite = self.cfg["overwrite"] + cdef bint extend = self.cfg["extend"] for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): morph = self.labels[tag_id] - doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) - doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) + # set morph + if doc.c[j].morph == 0 or overwrite or extend: + if overwrite and extend: + # morphologizer morph overwrites any existing features + # while extending + extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]) + extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))) + doc.c[j].morph = self.vocab.morphology.add(extended_morph) + elif extend: + # existing features are preserved and any new features + # are added + extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)) + extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])) + doc.c[j].morph = self.vocab.morphology.add(extended_morph) + else: + # clobber + doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0)) + # set POS + if doc.c[j].pos == 0 or overwrite: + doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0) def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and @@ -246,24 +303,3 @@ class Morphologizer(Tagger): if self.model.ops.xp.isnan(loss): raise ValueError(Errors.E910.format(name=self.name)) return float(loss), d_scores - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "pos" and "morph" and - Scorer.score_token_attr_per_feat for the attribute "morph". 
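The overwrite/extend combinations above come down to which side wins when the existing and predicted feature dicts are merged. A sketch with Morphology.feats_to_dict and made-up feature strings:

from spacy.morphology import Morphology

existing = Morphology.feats_to_dict("Case=Nom|Number=Sing")     # already on the token
predicted = Morphology.feats_to_dict("Gender=Fem|Number=Plur")  # morphologizer output

# extend=True, overwrite=False: existing values win, new features are added
extended = dict(predicted)
extended.update(existing)
assert extended == {"Case": "Nom", "Number": "Sing", "Gender": "Fem"}

# extend=True, overwrite=True: predicted values win, but unmatched existing
# features are still kept
merged = dict(existing)
merged.update(predicted)
assert merged == {"Case": "Nom", "Number": "Plur", "Gender": "Fem"}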
- - DOCS: https://spacy.io/api/morphologizer#score - """ - def morph_key_getter(token, attr): - return getattr(token, attr).key - - validate_examples(examples, "Morphologizer.score") - results = {} - results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) - results.update(Scorer.score_token_attr_per_feat(examples, - "morph", getter=morph_key_getter, **kwargs)) - return results diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index f4ae4b787..4835a8c4b 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,6 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from collections import defaultdict -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem @@ -9,7 +9,7 @@ from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import get_ner_prf, PRFScore -from ..training import validate_examples +from ..util import registry default_model_config = """ @@ -41,7 +41,8 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "moves": None, "update_with_oracle_cut_size": 100, "model": DEFAULT_NER_MODEL, - "incorrect_spans_key": None + "incorrect_spans_key": None, + "scorer": {"@scorers": "spacy.ner_scorer.v1"}, }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, @@ -52,7 +53,8 @@ def make_ner( model: Model, moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str], + scorer: Optional[Callable], ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -80,6 +82,7 @@ def make_ner( incorrect_spans_key (Optional[str]): Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. + scorer (Optional[Callable]): The scoring method. """ return EntityRecognizer( nlp.vocab, @@ -92,6 +95,7 @@ def make_ner( beam_width=1, beam_density=0.0, beam_update_prob=0.0, + scorer=scorer, ) @Language.factory( @@ -104,7 +108,8 @@ def make_ner( "beam_density": 0.01, "beam_update_prob": 0.5, "beam_width": 32, - "incorrect_spans_key": None + "incorrect_spans_key": None, + "scorer": None, }, default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, ) @@ -117,7 +122,8 @@ def make_beam_ner( beam_width: int, beam_density: float, beam_update_prob: float, - incorrect_spans_key: Optional[str]=None + incorrect_spans_key: Optional[str], + scorer: Optional[Callable], ): """Create a transition-based EntityRecognizer component that uses beam-search. The entity recognizer identifies non-overlapping labelled spans of tokens. @@ -153,6 +159,7 @@ def make_beam_ner( and are faster to compute. incorrect_spans_key (Optional[str]): Optional key into span groups of entities known to be non-entities. + scorer (Optional[Callable]): The scoring method. 
""" return EntityRecognizer( nlp.vocab, @@ -164,10 +171,20 @@ def make_beam_ner( beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, - incorrect_spans_key=incorrect_spans_key + incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) +def ner_score(examples, **kwargs): + return get_ner_prf(examples, **kwargs) + + +@registry.scorers("spacy.ner_scorer.v1") +def make_ner_scorer(): + return ner_score + + cdef class EntityRecognizer(Parser): """Pipeline component for named entity recognition. @@ -188,6 +205,7 @@ cdef class EntityRecognizer(Parser): beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, + scorer=ner_score, ): """Create an EntityRecognizer. """ @@ -204,6 +222,7 @@ cdef class EntityRecognizer(Parser): beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, + scorer=scorer, ) def add_multitask_objective(self, mt_component): @@ -227,17 +246,6 @@ cdef class EntityRecognizer(Parser): if move[0] in ("B", "I", "L", "U")) return tuple(sorted(labels)) - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The NER precision, recall and f-scores. - - DOCS: https://spacy.io/api/entityrecognizer#score - """ - validate_examples(examples, "EntityRecognizer.score") - return get_ner_prf(examples) - def scored_ents(self, beams): """Return a dictionary of (start, end, label) tuples with corresponding scores for each beam/doc that was processed. diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 4372645af..9eddc1e3f 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -81,6 +81,17 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe#score """ + if hasattr(self, "scorer") and self.scorer is not None: + scorer_kwargs = {} + # use default settings from cfg (e.g., threshold) + if hasattr(self, "cfg") and isinstance(self.cfg, dict): + scorer_kwargs.update(self.cfg) + # override self.cfg["labels"] with self.labels + if hasattr(self, "labels"): + scorer_kwargs["labels"] = self.labels + # override with kwargs settings + scorer_kwargs.update(kwargs) + return self.scorer(examples, **scorer_kwargs) return {} @property diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 60102efcb..77f4e8adb 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -1,26 +1,32 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, List +from typing import Optional, List, Callable import srsly from ..tokens.doc cimport Doc + from .pipe import Pipe +from .senter import senter_score from ..language import Language from ..scorer import Scorer -from ..training import validate_examples from .. 
import util +# see #9050 +BACKWARD_OVERWRITE = False + @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], - default_config={"punct_chars": None}, + default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) def make_sentencizer( nlp: Language, name: str, - punct_chars: Optional[List[str]] + punct_chars: Optional[List[str]], + overwrite: bool, + scorer: Optional[Callable], ): - return Sentencizer(name, punct_chars=punct_chars) + return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer) class Sentencizer(Pipe): @@ -41,12 +47,20 @@ class Sentencizer(Pipe): '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] - def __init__(self, name="sentencizer", *, punct_chars=None): + def __init__( + self, + name="sentencizer", + *, + punct_chars=None, + overwrite=BACKWARD_OVERWRITE, + scorer=senter_score, + ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. - RETURNS (Sentencizer): The sentencizer component. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the attribute "sents". DOCS: https://spacy.io/api/sentencizer#init """ @@ -55,6 +69,8 @@ class Sentencizer(Pipe): self.punct_chars = set(punct_chars) else: self.punct_chars = set(self.default_punct_chars) + self.overwrite = overwrite + self.scorer = scorer def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. @@ -115,29 +131,12 @@ class Sentencizer(Pipe): for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber existing sentence boundaries - if doc.c[j].sent_start == 0: + if doc.c[j].sent_start == 0 or self.overwrite: if tag_id: doc.c[j].sent_start = 1 else: doc.c[j].sent_start = -1 - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. - - DOCS: https://spacy.io/api/sentencizer#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "Sentencizer.score") - results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) - del results["sents_per_type"] - return results - def to_bytes(self, *, exclude=tuple()): """Serialize the sentencizer to a bytestring. @@ -145,7 +144,7 @@ class Sentencizer(Pipe): DOCS: https://spacy.io/api/sentencizer#to_bytes """ - return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) + return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars), "overwrite": self.overwrite}) def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the sentencizer from a bytestring. 
@@ -157,6 +156,7 @@ class Sentencizer(Pipe): """ cfg = srsly.msgpack_loads(bytes_data) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) + self.overwrite = cfg.get("overwrite", self.overwrite) return self def to_disk(self, path, *, exclude=tuple()): @@ -166,7 +166,7 @@ class Sentencizer(Pipe): """ path = util.ensure_path(path) path = path.with_suffix(".json") - srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) + srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite}) def from_disk(self, path, *, exclude=tuple()): @@ -178,4 +178,5 @@ class Sentencizer(Pipe): path = path.with_suffix(".json") cfg = srsly.read_json(path) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) + self.overwrite = cfg.get("overwrite", self.overwrite) return self diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index f9472abf5..54ce021af 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,5 +1,6 @@ # cython: infer_types=True, profile=True, binding=True from itertools import islice +from typing import Optional, Callable import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config @@ -11,8 +12,11 @@ from ..language import Language from ..errors import Errors from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry from .. import util +# See #9050 +BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -34,11 +38,25 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL}, + default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model): - return SentenceRecognizer(nlp.vocab, model, name) +def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): + return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) + + +def senter_score(examples, **kwargs): + def has_sents(doc): + return doc.has_annotation("SENT_START") + + results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) + del results["sents_per_type"] + return results + + +@registry.scorers("spacy.senter_scorer.v1") +def make_senter_scorer(): + return senter_score class SentenceRecognizer(Tagger): @@ -46,13 +64,23 @@ class SentenceRecognizer(Tagger): DOCS: https://spacy.io/api/sentencerecognizer """ - def __init__(self, vocab, model, name="senter"): + def __init__( + self, + vocab, + model, + name="senter", + *, + overwrite=BACKWARD_OVERWRITE, + scorer=senter_score, + ): """Initialize a sentence recognizer. vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the attribute "sents". 
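With overwrite stored next to punct_chars, the flag now survives serialization. A minimal sketch of the roundtrip:

from spacy.pipeline import Sentencizer

s1 = Sentencizer("sentencizer", punct_chars=["."], overwrite=True)
s2 = Sentencizer("sentencizer").from_bytes(s1.to_bytes())
assert s2.overwrite is True
assert s2.punct_chars == {"."}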
DOCS: https://spacy.io/api/sentencerecognizer#init """ @@ -60,7 +88,8 @@ class SentenceRecognizer(Tagger): self.model = model self.name = name self._rehearsal_model = None - self.cfg = {} + self.cfg = {"overwrite": overwrite} + self.scorer = scorer @property def labels(self): @@ -85,13 +114,13 @@ class SentenceRecognizer(Tagger): if isinstance(docs, Doc): docs = [docs] cdef Doc doc + cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber existing sentence boundaries - if doc.c[j].sent_start == 0: + if doc.c[j].sent_start == 0 or overwrite: if tag_id == 1: doc.c[j].sent_start = 1 else: @@ -153,18 +182,3 @@ class SentenceRecognizer(Tagger): def add_label(self, label, values=None): raise NotImplementedError - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. - DOCS: https://spacy.io/api/sentencerecognizer#score - """ - def has_sents(doc): - return doc.has_annotation("SENT_START") - - validate_examples(examples, "SentenceRecognizer.score") - results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) - del results["sents_per_type"] - return results diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 84a9b69cc..5b84ce8fb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -104,6 +104,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: "max_positive": None, "model": DEFAULT_SPANCAT_MODEL, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -113,8 +114,9 @@ def make_spancat( suggester: Suggester, model: Model[Tuple[List[Doc], Ragged], Floats2d], spans_key: str, - threshold: float = 0.5, - max_positive: Optional[int] = None, + scorer: Optional[Callable], + threshold: float, + max_positive: Optional[int], ) -> "SpanCategorizer": """Create a SpanCategorizer component. The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller @@ -144,9 +146,28 @@ def make_spancat( threshold=threshold, max_positive=max_positive, name=name, + scorer=scorer, ) +def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + kwargs = dict(kwargs) + attr_prefix = "spans_" + key = kwargs["spans_key"] + kwargs.setdefault("attr", f"{attr_prefix}{key}") + kwargs.setdefault("allow_overlap", True) + kwargs.setdefault( + "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) + ) + kwargs.setdefault("has_annotation", lambda doc: key in doc.spans) + return Scorer.score_spans(examples, **kwargs) + + +@registry.scorers("spacy.spancat_scorer.v1") +def make_spancat_scorer(): + return spancat_score + + class SpanCategorizer(TrainablePipe): """Pipeline component to label spans of text. @@ -163,8 +184,25 @@ class SpanCategorizer(TrainablePipe): spans_key: str = "spans", threshold: float = 0.5, max_positive: Optional[int] = None, + scorer: Optional[Callable] = spancat_score, ) -> None: """Initialize the span categorizer. + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. 
+ name (str): The component instance name, used to add entries to the + losses during training. + spans_key (str): Key of the Doc.spans dict to save the spans under. + During initialization and training, the component will look for + spans on the reference document under the same key. Defaults to + `"spans"`. + threshold (float): Minimum probability to consider a prediction + positive. Spans with a positive prediction will be saved on the Doc. + Defaults to 0.5. + max_positive (Optional[int]): Maximum number of labels to consider + positive per span. Defaults to None, indicating no limit. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the Doc.spans[spans_key] with overlapping + spans allowed. DOCS: https://spacy.io/api/spancategorizer#init """ @@ -178,6 +216,7 @@ class SpanCategorizer(TrainablePipe): self.suggester = suggester self.model = model self.name = name + self.scorer = scorer @property def key(self) -> str: @@ -379,26 +418,6 @@ class SpanCategorizer(TrainablePipe): else: self.model.initialize() - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. - - DOCS: https://spacy.io/api/spancategorizer#score - """ - validate_examples(examples, "SpanCategorizer.score") - self._validate_categories(examples) - kwargs = dict(kwargs) - attr_prefix = "spans_" - kwargs.setdefault("attr", f"{attr_prefix}{self.key}") - kwargs.setdefault("allow_overlap", True) - kwargs.setdefault( - "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], []) - ) - kwargs.setdefault("has_annotation", lambda doc: self.key in doc.spans) - return Scorer.score_spans(examples, **kwargs) - def _validate_categories(self, examples: Iterable[Example]): # TODO pass diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index fa260bdd6..a9cbac37a 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True, binding=True +from typing import Callable, Optional import numpy import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config @@ -18,8 +19,11 @@ from ..parts_of_speech import X from ..errors import Errors, Warnings from ..scorer import Scorer from ..training import validate_examples, validate_get_examples +from ..util import registry from .. import util +# See #9050 +BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -41,10 +45,16 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL}, + default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}}, default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model): +def make_tagger( + nlp: Language, + name: str, + model: Model, + overwrite: bool, + scorer: Optional[Callable], +): """Construct a part-of-speech tagger component. model (Model[List[Doc], List[Floats2d]]): A model instance that predicts @@ -52,7 +62,16 @@ def make_tagger(nlp: Language, name: str, model: Model): in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). 
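Calling the new module-level spancat_score directly requires passing spans_key by hand; when it runs through Pipe.score, the key is forwarded from the component's cfg. The spans below are made up for illustration.

from spacy.lang.en import English
from spacy.pipeline.spancat import spancat_score
from spacy.tokens import Span
from spacy.training import Example

nlp = English()
doc = nlp.make_doc("Berlin is nice")
pred, ref = doc.copy(), doc.copy()
pred.spans["spans"] = [Span(pred, 0, 1, label="CITY")]
ref.spans["spans"] = [Span(ref, 0, 1, label="CITY")]
scores = spancat_score([Example(pred, ref)], spans_key="spans")
print(scores["spans_spans_f"])  # 1.0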
""" - return Tagger(nlp.vocab, model, name) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) + + +def tagger_score(examples, **kwargs): + return Scorer.score_token_attr(examples, "tag", **kwargs) + + +@registry.scorers("spacy.tagger_scorer.v1") +def make_tagger_scorer(): + return tagger_score class Tagger(TrainablePipe): @@ -60,13 +79,23 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger"): + def __init__( + self, + vocab, + model, + name="tagger", + *, + overwrite=BACKWARD_OVERWRITE, + scorer=tagger_score, + ): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attribute "tag". DOCS: https://spacy.io/api/tagger#init """ @@ -74,8 +103,9 @@ class Tagger(TrainablePipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"labels": []} + cfg = {"labels": [], "overwrite": overwrite} self.cfg = dict(sorted(cfg.items())) + self.scorer = scorer @property def labels(self): @@ -135,13 +165,13 @@ class Tagger(TrainablePipe): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab + cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber preset POS tags - if doc.c[j].tag == 0: + if doc.c[j].tag == 0 or overwrite: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] def update(self, examples, *, drop=0., sgd=None, losses=None): @@ -289,15 +319,3 @@ class Tagger(TrainablePipe): self.cfg["labels"].append(label) self.vocab.strings.add(label) return 1 - - def score(self, examples, **kwargs): - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by - Scorer.score_token_attr for the attributes "tag". - - DOCS: https://spacy.io/api/tagger#score - """ - validate_examples(examples, "Tagger.score") - return Scorer.score_token_attr(examples, "tag", **kwargs) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 085b949cc..30a65ec52 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -10,6 +10,7 @@ from ..training import Example, validate_examples, validate_get_examples from ..errors import Errors from ..scorer import Scorer from ..tokens import Doc +from ..util import registry from ..vocab import Vocab @@ -70,7 +71,11 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_SINGLE_TEXTCAT_MODEL}, + default_config={ + "threshold": 0.5, + "model": DEFAULT_SINGLE_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, + }, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -86,7 +91,11 @@ subword_features = true }, ) def make_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + threshold: float, + scorer: Optional[Callable], ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. 
It can learn one or more labels, and the labels are considered @@ -95,8 +104,23 @@ def make_textcat( model (Model[List[Doc], List[Floats2d]]): A model instance that predicts scores for each category. threshold (float): Cutoff to consider a prediction "positive". + scorer (Optional[Callable]): The scoring method. """ - return TextCategorizer(nlp.vocab, model, name, threshold=threshold) + return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) + + +def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_cats( + examples, + "cats", + multi_label=False, + **kwargs, + ) + + +@registry.scorers("spacy.textcat_scorer.v1") +def make_textcat_scorer(): + return textcat_score class TextCategorizer(TrainablePipe): @@ -106,7 +130,13 @@ class TextCategorizer(TrainablePipe): """ def __init__( - self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float + self, + vocab: Vocab, + model: Model, + name: str = "textcat", + *, + threshold: float, + scorer: Optional[Callable] = textcat_score, ) -> None: """Initialize a text categorizer for single-label classification. @@ -115,6 +145,8 @@ class TextCategorizer(TrainablePipe): name (str): The component instance name, used to add entries to the losses during training. threshold (float): Cutoff to consider a prediction "positive". + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_cats for the attribute "cats". DOCS: https://spacy.io/api/textcategorizer#init """ @@ -124,6 +156,7 @@ class TextCategorizer(TrainablePipe): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) + self.scorer = scorer @property def labels(self) -> Tuple[str]: @@ -353,26 +386,6 @@ class TextCategorizer(TrainablePipe): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. - - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. 
- - DOCS: https://spacy.io/api/textcategorizer#score - """ - validate_examples(examples, "TextCategorizer.score") - self._validate_categories(examples) - kwargs.setdefault("threshold", self.cfg["threshold"]) - kwargs.setdefault("positive_label", self.cfg["positive_label"]) - return Scorer.score_cats( - examples, - "cats", - labels=self.labels, - multi_label=False, - **kwargs, - ) - def _validate_categories(self, examples: Iterable[Example]): """Check whether the provided examples all have single-label cats annotations.""" for ex in examples: diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 65961a38c..a7bfacca7 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -5,10 +5,11 @@ from thinc.api import Model, Config from thinc.types import Floats2d from ..language import Language -from ..training import Example, validate_examples, validate_get_examples +from ..training import Example, validate_get_examples from ..errors import Errors from ..scorer import Scorer from ..tokens import Doc +from ..util import registry from ..vocab import Vocab from .textcat import TextCategorizer @@ -70,7 +71,11 @@ subword_features = true @Language.factory( "textcat_multilabel", assigns=["doc.cats"], - default_config={"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}, + default_config={ + "threshold": 0.5, + "model": DEFAULT_MULTI_TEXTCAT_MODEL, + "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"}, + }, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -86,7 +91,11 @@ subword_features = true }, ) def make_multilabel_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + threshold: float, + scorer: Optional[Callable], ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -97,7 +106,23 @@ def make_multilabel_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". """ - return MultiLabel_TextCategorizer(nlp.vocab, model, name, threshold=threshold) + return MultiLabel_TextCategorizer( + nlp.vocab, model, name, threshold=threshold, scorer=scorer + ) + + +def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: + return Scorer.score_cats( + examples, + "cats", + multi_label=True, + **kwargs, + ) + + +@registry.scorers("spacy.textcat_multilabel_scorer.v1") +def make_textcat_multilabel_scorer(): + return textcat_multilabel_score class MultiLabel_TextCategorizer(TextCategorizer): @@ -113,6 +138,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): name: str = "textcat_multilabel", *, threshold: float, + scorer: Optional[Callable] = textcat_multilabel_score, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -130,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): self._rehearsal_model = None cfg = {"labels": [], "threshold": threshold} self.cfg = dict(cfg) + self.scorer = scorer def initialize( # type: ignore[override] self, @@ -166,24 +193,6 @@ class MultiLabel_TextCategorizer(TextCategorizer): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: - """Score a batch of examples. 
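The labels, threshold and positive_label defaults that the removed score method used to fill in are now injected by Pipe.score from the component's cfg and labels, so calling the registered scorer directly means passing them explicitly. The categories below are made up for illustration.

from spacy.lang.en import English
from spacy.pipeline.textcat import textcat_score
from spacy.training import Example

nlp = English()
pred = nlp.make_doc("good stuff")
pred.cats = {"POS": 0.9, "NEG": 0.1}
example = Example.from_dict(pred, {"cats": {"POS": 1.0, "NEG": 0.0}})
scores = textcat_score(
    [example], labels=["POS", "NEG"], positive_label="POS", threshold=0.5
)
print(scores["cats_score"], scores["cats_score_desc"])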
- - examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats. - - DOCS: https://spacy.io/api/textcategorizer#score - """ - validate_examples(examples, "MultiLabel_TextCategorizer.score") - kwargs.setdefault("threshold", self.cfg["threshold"]) - return Scorer.score_cats( - examples, - "cats", - labels=self.labels, - multi_label=True, - **kwargs, - ) - def _validate_categories(self, examples: Iterable[Example]): """This component allows any type of single- or multi-label annotations. This method overwrites the more strict one from 'textcat'.""" diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd index d5cdbb511..65daa8b22 100644 --- a/spacy/pipeline/trainable_pipe.pxd +++ b/spacy/pipeline/trainable_pipe.pxd @@ -5,3 +5,4 @@ cdef class TrainablePipe(Pipe): cdef public Vocab vocab cdef public object model cdef public object cfg + cdef public object scorer diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 5e11f5972..2571af102 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -49,7 +49,8 @@ cdef class Parser(TrainablePipe): beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), - incorrect_spans_key=None + incorrect_spans_key=None, + scorer=None, ): """Create a Parser. @@ -86,6 +87,7 @@ cdef class Parser(TrainablePipe): incorrect_spans_key (Optional[str]): Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. + scorer (Optional[Callable]): The scoring method. Defaults to None. """ self.vocab = vocab self.name = name @@ -117,6 +119,7 @@ cdef class Parser(TrainablePipe): self.add_multitask_objective(multitask) self._rehearsal_model = None + self.scorer = scorer def __getnewargs_ex__(self): """This allows pickling the Parser and its keyword-only init arguments""" diff --git a/spacy/schemas.py b/spacy/schemas.py index 73ddc45b1..b3ea11d8b 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -351,7 +351,8 @@ class ConfigSchemaPretrain(BaseModel): # fmt: off max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for") dropout: StrictFloat = Field(..., title="Dropout rate") - n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency") + n_save_every: Optional[StrictInt] = Field(..., title="Saving additional temporary model after n batches within an epoch") + n_save_epoch: Optional[StrictInt] = Field(..., title="Saving model after every n epoch") optimizer: Optimizer = Field(..., title="The optimizer to use") corpus: StrictStr = Field(..., title="Path in the config to the training data") batcher: Batcher = Field(..., title="Batcher for the training data") diff --git a/spacy/scorer.py b/spacy/scorer.py index ebab2382d..75e5b3317 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -247,18 +247,21 @@ class Scorer: missing_values: Set[Any] = MISSING_VALUES, # type: ignore[assignment] **cfg, ) -> Dict[str, Any]: - """Return PRF scores per feat for a token attribute in UFEATS format. + """Return micro PRF and PRF scores per feat for a token attribute in + UFEATS format. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, getter(token, attr) should return the value of the attribute for an individual token. 
- missing_values (Set[Any]): Attribute values to treat as missing annotation - in the reference annotation. - RETURNS (dict): A dictionary containing the per-feat PRF scores under - the key attr_per_feat. + missing_values (Set[Any]): Attribute values to treat as missing + annotation in the reference annotation. + RETURNS (dict): A dictionary containing the micro PRF scores under the + key attr_micro_p/r/f and the per-feat PRF scores under + attr_per_feat. """ + micro_score = PRFScore() per_feat = {} for example in examples: pred_doc = example.predicted @@ -300,15 +303,22 @@ class Scorer: pred_per_feat[field] = set() pred_per_feat[field].add((gold_i, feat)) for field in per_feat: + micro_score.score_set(pred_per_feat.get(field, set()), gold_per_feat.get(field, set())) per_feat[field].score_set( pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) ) - score_key = f"{attr}_per_feat" - if any([len(v) for v in per_feat.values()]): - result = {k: v.to_dict() for k, v in per_feat.items()} - return {score_key: result} + result: Dict[str, Any] = {} + if len(micro_score) > 0: + result[f"{attr}_micro_p"] = micro_score.precision + result[f"{attr}_micro_r"] = micro_score.recall + result[f"{attr}_micro_f"] = micro_score.fscore + result[f"{attr}_per_feat"] = {k: v.to_dict() for k, v in per_feat.items()} else: - return {score_key: None} + result[f"{attr}_micro_p"] = None + result[f"{attr}_micro_r"] = None + result[f"{attr}_micro_f"] = None + result[f"{attr}_per_feat"] = None + return result @staticmethod def score_spans( @@ -545,7 +555,7 @@ class Scorer: @staticmethod def score_links( - examples: Iterable[Example], *, negative_labels: Iterable[str] + examples: Iterable[Example], *, negative_labels: Iterable[str], **cfg ) -> Dict[str, Any]: """Returns PRF for predicted links on the entity level. 
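With the micro scores added, score_token_attr_per_feat returns attr_micro_p/r/f alongside attr_per_feat. A sketch using the same key-based getter as the morphologizer; the feature values are illustrative.

from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example

nlp = English()
doc = nlp.make_doc("cats sleep")
example = Example.from_dict(
    doc, {"morphs": ["Number=Plur", "Number=Plur|Tense=Pres"]}
)
# copy the reference morphology onto the predicted doc so the scores are 1.0
for pred_tok, ref_tok in zip(example.predicted, example.reference):
    pred_tok.set_morph(str(ref_tok.morph))
scores = Scorer.score_token_attr_per_feat(
    [example], "morph", getter=lambda t, attr: getattr(t, attr).key
)
print(scores["morph_micro_f"])           # 1.0
print(sorted(scores["morph_per_feat"]))  # ['Number', 'Tense']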
To disentangle the performance of the NEL from the NER, @@ -721,7 +731,7 @@ class Scorer: } -def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: +def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" score_per_type = defaultdict(PRFScore) for eg in examples: diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 07768d347..370180135 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -8,10 +8,10 @@ from murmurhash.mrmr cimport hash64 from .typedefs cimport attr_t, hash_t -cpdef hash_t hash_string(unicode string) except 0 +cpdef hash_t hash_string(str string) except 0 cdef hash_t hash_utf8(char* utf8_string, int length) nogil -cdef unicode decode_Utf8Str(const Utf8Str* string) +cdef str decode_Utf8Str(const Utf8Str* string) ctypedef union Utf8Str: @@ -25,5 +25,5 @@ cdef class StringStore: cdef vector[hash_t] keys cdef public PreshMap _map - cdef const Utf8Str* intern_unicode(self, unicode py_string) + cdef const Utf8Str* intern_unicode(self, str py_string) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 4a20cb8af..39fc441e9 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -33,7 +33,7 @@ def get_string_id(key): return hash_utf8(chars, len(chars)) -cpdef hash_t hash_string(unicode string) except 0: +cpdef hash_t hash_string(str string) except 0: chars = string.encode("utf8") return hash_utf8(chars, len(chars)) @@ -46,7 +46,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: return hash32(utf8_string, length, 1) -cdef unicode decode_Utf8Str(const Utf8Str* string): +cdef str decode_Utf8Str(const Utf8Str* string): cdef int i, length if string.s[0] < sizeof(string.s) and string.s[0] != 0: return string.s[1:string.s[0]+1].decode("utf8") @@ -107,17 +107,17 @@ cdef class StringStore: def __getitem__(self, object string_or_id): """Retrieve a string from a given hash, or vice versa. - string_or_id (bytes, unicode or uint64): The value to encode. + string_or_id (bytes, str or uint64): The value to encode. Returns (str / uint64): The value to be retrieved. """ - if isinstance(string_or_id, basestring) and len(string_or_id) == 0: + if isinstance(string_or_id, str) and len(string_or_id) == 0: return 0 elif string_or_id == 0: return "" elif string_or_id in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string_or_id] cdef hash_t key - if isinstance(string_or_id, unicode): + if isinstance(string_or_id, str): key = hash_string(string_or_id) return key elif isinstance(string_or_id, bytes): @@ -135,14 +135,14 @@ cdef class StringStore: def as_int(self, key): """If key is an int, return it; otherwise, get the int value.""" - if not isinstance(key, basestring): + if not isinstance(key, str): return key else: return self[key] def as_string(self, key): """If key is a string, return it; otherwise, get the string value.""" - if isinstance(key, basestring): + if isinstance(key, str): return key else: return self[key] @@ -153,7 +153,7 @@ cdef class StringStore: string (str): The string to add. RETURNS (uint64): The string's hash value. 
""" - if isinstance(string, unicode): + if isinstance(string, str): if string in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string] key = hash_string(string) @@ -189,7 +189,7 @@ cdef class StringStore: return True elif string in SYMBOLS_BY_STR: return True - elif isinstance(string, unicode): + elif isinstance(string, str): key = hash_string(string) else: string = string.encode("utf8") @@ -269,7 +269,7 @@ cdef class StringStore: for string in strings: self.add(string) - cdef const Utf8Str* intern_unicode(self, unicode py_string): + cdef const Utf8Str* intern_unicode(self, str py_string): # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode("utf8") return self._intern_utf8(byte_string, len(byte_string)) diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index 28cb66714..738a751a0 100644 --- a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -5,9 +5,11 @@ from spacy.compat import pickle def test_pickle_single_doc(): nlp = Language() doc = nlp("pickle roundtrip") + doc._context = 3 data = pickle.dumps(doc, 1) doc2 = pickle.loads(data) assert doc2.text == "pickle roundtrip" + assert doc2._context == 3 def test_list_of_docs_pickles_efficiently(): diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py index cfb574b63..499027ab1 100644 --- a/spacy/tests/lang/ca/test_exception.py +++ b/spacy/tests/lang/ca/test_exception.py @@ -11,7 +11,18 @@ def test_ca_tokenizer_handles_abbr(ca_tokenizer, text, lemma): def test_ca_tokenizer_handles_exc_in_text(ca_tokenizer): - text = "La Núria i el Pere han vingut aprox. a les 7 de la tarda." - tokens = ca_tokenizer(text) - assert len(tokens) == 15 - assert tokens[7].text == "aprox." + text = "La Dra. Puig viu a la pl. dels Til·lers." + doc = ca_tokenizer(text) + assert [t.text for t in doc] == [ + "La", + "Dra.", + "Puig", + "viu", + "a", + "la", + "pl.", + "d", + "els", + "Til·lers", + ".", + ] diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index a3c76ab5b..afbdf3696 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -2,7 +2,14 @@ import pytest @pytest.mark.parametrize( - "text,expected_tokens", [("d'un", ["d'", "un"]), ("s'ha", ["s'", "ha"])] + "text,expected_tokens", + [ + ("d'un", ["d'", "un"]), + ("s'ha", ["s'", "ha"]), + ("del", ["d", "el"]), + ("cantar-te", ["cantar", "-te"]), + ("-hola", ["-", "hola"]), + ], ) def test_contractions(ca_tokenizer, text, expected_tokens): """Test that the contractions are split into two tokens""" diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 55bad0e94..5db7af553 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -12,17 +12,20 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer): una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" tokens = ca_tokenizer(text) - assert len(tokens) == 140 + assert len(tokens) == 146 @pytest.mark.parametrize( "text,length", [ - ("Perquè va anar-hi?", 4), + ("Perquè va anar-hi?", 5), + ("El cotxe dels veins.", 6), ("“Ah no?”", 5), ("""Sí! "Anem", va contestar el Joan Carles""", 11), ("Van córrer aprox. 
10km", 5), ("Llavors perqué...", 3), + ("Vull parlar-te'n demà al matí", 8), + ("Vull explicar-t'ho demà al matí", 8), ], ) def test_ca_tokenizer_handles_cnts(ca_tokenizer, text, length): diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index 6041611e6..21879a569 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -8,3 +8,17 @@ import pytest def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ assert test_lemma == lemma + + +@pytest.mark.parametrize( + "word,norm", + [ + ("SUMMER", "サマー"), + ("食べ物", "食べ物"), + ("綜合", "総合"), + ("コンピュータ", "コンピューター"), + ], +) +def test_ja_lemmatizer_norm(ja_tokenizer, word, norm): + test_norm = ja_tokenizer(word)[0].norm_ + assert test_norm == norm diff --git a/spacy/tests/lang/ja/test_morphologizer_factory.py b/spacy/tests/lang/ja/test_morphologizer_factory.py new file mode 100644 index 000000000..a4e038d01 --- /dev/null +++ b/spacy/tests/lang/ja/test_morphologizer_factory.py @@ -0,0 +1,9 @@ +import pytest +from spacy.lang.ja import Japanese + + +def test_ja_morphologizer_factory(): + pytest.importorskip("sudachipy") + nlp = Japanese() + morphologizer = nlp.add_pipe("morphologizer") + assert morphologizer.cfg["extend"] is True diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py index e05a363bf..011eb470f 100644 --- a/spacy/tests/lang/ja/test_serialize.py +++ b/spacy/tests/lang/ja/test_serialize.py @@ -1,3 +1,5 @@ +import pickle + from spacy.lang.ja import Japanese from ...util import make_tempdir @@ -31,3 +33,9 @@ def test_ja_tokenizer_serialize(ja_tokenizer): nlp_r.from_disk(d) assert nlp_bytes == nlp_r.to_bytes() assert nlp_r.tokenizer.split_mode == "B" + + +def test_ja_tokenizer_pickle(ja_tokenizer): + b = pickle.dumps(ja_tokenizer) + ja_tokenizer_re = pickle.loads(b) + assert ja_tokenizer.to_bytes() == ja_tokenizer_re.to_bytes() diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index c8c85d655..098884cf0 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -34,22 +34,22 @@ SENTENCE_TESTS = [ ] tokens1 = [ - DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), - DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None), + DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None), + DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None), ] tokens2 = [ - DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), - DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), - DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", reading="イイン", sub_tokens=None), - DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", reading="カイ", sub_tokens=None), + DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None), + DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None), + DetailedToken(surface="委員", tag="名詞-普通名詞-一般", inf="", lemma="委員", norm="委員", reading="イイン", sub_tokens=None), + DetailedToken(surface="会", tag="名詞-普通名詞-一般", inf="", lemma="会", norm="会", reading="カイ", sub_tokens=None), ] tokens3 = [ - 
DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", reading="センキョ", sub_tokens=None), - DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", reading="カンリ", sub_tokens=None), - DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", reading="イインカイ", sub_tokens=None), + DetailedToken(surface="選挙", tag="名詞-普通名詞-サ変可能", inf="", lemma="選挙", norm="選挙", reading="センキョ", sub_tokens=None), + DetailedToken(surface="管理", tag="名詞-普通名詞-サ変可能", inf="", lemma="管理", norm="管理", reading="カンリ", sub_tokens=None), + DetailedToken(surface="委員会", tag="名詞-普通名詞-一般", inf="", lemma="委員会", norm="委員会", reading="イインカイ", sub_tokens=None), ] SUB_TOKEN_TESTS = [ - ("選挙管理委員会", [None, None, None, None], [None, None, [tokens1]], [[tokens2, tokens3]]) + ("選挙管理委員会", [None, None, [tokens1]], [[tokens2, tokens3]]) ] # fmt: on @@ -111,18 +111,16 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_c(text)) == len_c -@pytest.mark.parametrize( - "text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS -) +@pytest.mark.parametrize("text,sub_tokens_list_b,sub_tokens_list_c", SUB_TOKEN_TESTS) def test_ja_tokenizer_sub_tokens( - ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c + ja_tokenizer, text, sub_tokens_list_b, sub_tokens_list_c ): nlp_a = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}}) nlp_b = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "B"}}}) nlp_c = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "C"}}}) - assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a - assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert ja_tokenizer(text).user_data.get("sub_tokens") is None + assert nlp_a(text).user_data.get("sub_tokens") is None assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c @@ -132,16 +130,24 @@ def test_ja_tokenizer_sub_tokens( [ ( "取ってつけた", - ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), - ("トッ", "テ", "ツケ", "タ"), + (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]), + (["トッ"], ["テ"], ["ツケ"], ["タ"]), + ), + ( + "2=3", + ([], [], []), + (["ニ"], ["_"], ["サン"]) ), ], ) def test_ja_tokenizer_inflections_reading_forms( ja_tokenizer, text, inflections, reading_forms ): - assert ja_tokenizer(text).user_data["inflections"] == inflections - assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms + tokens = ja_tokenizer(text) + test_inflections = [tt.morph.get("Inflection") for tt in tokens] + assert test_inflections == list(inflections) + test_readings = [tt.morph.get("Reading") for tt in tokens] + assert test_readings == list(reading_forms) def test_ja_tokenizer_emptyish_texts(ja_tokenizer): diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py new file mode 100644 index 000000000..75288fcc5 --- /dev/null +++ b/spacy/tests/lang/ko/test_serialize.py @@ -0,0 +1,24 @@ +import pickle + +from spacy.lang.ko import Korean +from ...util import make_tempdir + + +def test_ko_tokenizer_serialize(ko_tokenizer): + tokenizer_bytes = ko_tokenizer.to_bytes() + nlp = Korean() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + with make_tempdir() as d: + file_path = d / "tokenizer" + ko_tokenizer.to_disk(file_path) + nlp = Korean() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + +def 
test_ko_tokenizer_pickle(ko_tokenizer): + b = pickle.dumps(ko_tokenizer) + ko_tokenizer_re = pickle.loads(b) + assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes() diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py index 91a048764..5cf6eb1a6 100644 --- a/spacy/tests/lang/ky/test_tokenizer.py +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index b39109455..6a7a404fd 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match): ("www.google.com", True), ("google.com", True), ("sydney.com", True), - ("2girls1cup.org", True), + ("1abc2def.org", True), ("http://stupid", True), ("www.hi", True), + ("example.com/example", True), ("dog", False), ("1.2", False), ("1.a", False), diff --git a/spacy/tests/lang/th/test_serialize.py b/spacy/tests/lang/th/test_serialize.py new file mode 100644 index 000000000..a3de4bf54 --- /dev/null +++ b/spacy/tests/lang/th/test_serialize.py @@ -0,0 +1,24 @@ +import pickle + +from spacy.lang.th import Thai +from ...util import make_tempdir + + +def test_th_tokenizer_serialize(th_tokenizer): + tokenizer_bytes = th_tokenizer.to_bytes() + nlp = Thai() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + with make_tempdir() as d: + file_path = d / "tokenizer" + th_tokenizer.to_disk(file_path) + nlp = Thai() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + + +def test_th_tokenizer_pickle(th_tokenizer): + b = pickle.dumps(th_tokenizer) + th_tokenizer_re = pickle.loads(b) + assert th_tokenizer.to_bytes() == th_tokenizer_re.to_bytes() diff --git a/spacy/tests/lang/ti/test_text.py b/spacy/tests/lang/ti/test_text.py index 177a9e4b2..d21005640 100644 --- a/spacy/tests/lang/ti/test_text.py +++ b/spacy/tests/lang/ti/test_text.py @@ -37,7 +37,7 @@ def test_ti_tokenizer_handles_cnts(ti_tokenizer, text, length): ("10.000", True), ("1000", True), ("999,0", True), - ("ሐደ", True), + ("ሓደ", True), ("ክልተ", True), ("ትሪልዮን", True), ("ከልቢ", False), diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py index ed4652df7..55dab799c 100644 --- a/spacy/tests/lang/vi/test_serialize.py +++ b/spacy/tests/lang/vi/test_serialize.py @@ -1,3 +1,5 @@ +import pickle + from spacy.lang.vi import Vietnamese from ...util import make_tempdir @@ -31,3 +33,9 @@ def test_vi_tokenizer_serialize(vi_tokenizer): nlp_r.from_disk(d) assert nlp_bytes == nlp_r.to_bytes() assert nlp_r.tokenizer.use_pyvi is False + + +def test_vi_tokenizer_pickle(vi_tokenizer): + b = pickle.dumps(vi_tokenizer) + vi_tokenizer_re = pickle.loads(b) + assert vi_tokenizer.to_bytes() == vi_tokenizer_re.to_bytes() diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 9c750ffd0..dab3ebf57 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -32,24 +32,6 @@ def pattern_dicts(): ] -@registry.misc("attribute_ruler_patterns") -def attribute_ruler_patterns(): - return [ - { - "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]], - "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, - }, - # one pattern sets the lemma - {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}, - # another pattern sets the morphology 
- { - "patterns": [[{"ORTH": "test"}]], - "attrs": {"MORPH": "Case=Nom|Number=Sing"}, - "index": 0, - }, - ] - - @pytest.fixture def tag_map(): return { @@ -121,7 +103,25 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") + # initialize with patterns from misc registry + @registry.misc("attribute_ruler_patterns") + def attribute_ruler_patterns(): + return [ + { + "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]], + "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, + }, + # one pattern sets the lemma + {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}, + # another pattern sets the morphology + { + "patterns": [[{"ORTH": "test"}]], + "attrs": {"MORPH": "Case=Nom|Number=Sing"}, + "index": 0, + }, + ] + nlp.config["initialize"]["components"]["attribute_ruler"] = { "patterns": {"@misc": "attribute_ruler_patterns"} } @@ -162,6 +162,26 @@ def test_attributeruler_score(nlp, pattern_dicts): assert scores["lemma_acc"] == pytest.approx(0.2) # no morphs are set assert scores["morph_acc"] is None + nlp.remove_pipe("attribute_ruler") + + # test with custom scorer + @registry.misc("weird_scorer.v1") + def make_weird_scorer(): + def weird_scorer(examples, weird_score, **kwargs): + return {"weird_score": weird_score} + + return weird_scorer + + ruler = nlp.add_pipe( + "attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}} + ) + ruler.initialize(lambda: [], patterns=pattern_dicts) + scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345}) + assert scores["weird_score"] == 0.12345 + assert "token_acc" in scores + assert "lemma_acc" not in scores + scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456}) + assert scores["weird_score"] == 0.23456 def test_attributeruler_rule_order(nlp): diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 9680d70d2..11d6f0477 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -8,6 +8,7 @@ from spacy.language import Language from spacy.tests.util import make_tempdir from spacy.morphology import Morphology from spacy.attrs import MORPH +from spacy.tokens import Doc def test_label_types(): @@ -137,6 +138,41 @@ def test_overfitting_IO(): assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags + # Test overwrite+extend settings + # (note that "" is unset, "_" is set and empty) + morphs = ["Feat=V", "Feat=N", "_"] + doc = Doc(nlp.vocab, words=["blue", "ham", "like"], morphs=morphs) + orig_morphs = [str(t.morph) for t in doc] + orig_pos_tags = [t.pos_ for t in doc] + morphologizer = nlp.get_pipe("morphologizer") + + # don't overwrite or extend + morphologizer.cfg["overwrite"] = False + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == orig_morphs + assert [t.pos_ for t in doc] == orig_pos_tags + + # overwrite and extend + morphologizer.cfg["overwrite"] = True + morphologizer.cfg["extend"] = True + doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""]) + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == ["Feat=N|That=A|This=A", "Feat=V"] + + # extend without overwriting + morphologizer.cfg["overwrite"] = False + morphologizer.cfg["extend"] = True + doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", "That=B"]) + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == 
["Feat=A|That=A|This=A", "Feat=V|That=B"] + + # overwrite without extending + morphologizer.cfg["overwrite"] = True + morphologizer.cfg["extend"] = False + doc = Doc(nlp.vocab, words=["I", "like"], morphs=["Feat=A|That=A|This=A", ""]) + doc = morphologizer(doc) + assert [str(t.morph) for t in doc] == ["Feat=N", "Feat=V"] + # Test with unset morph and partial POS nlp.remove_pipe("morphologizer") nlp.add_pipe("morphologizer") diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 3fe9363bf..ab403ab54 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,7 +1,9 @@ import pytest import pickle +from thinc.api import get_current_ops from spacy.vocab import Vocab from spacy.strings import StringStore +from spacy.vectors import Vectors from ..util import make_tempdir @@ -129,7 +131,11 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2): @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_pickle_vocab(strings, lex_attr): vocab = Vocab(strings=strings) + ops = get_current_ops() + vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1) + vocab.vectors = vectors vocab[strings[0]].norm_ = lex_attr vocab_pickled = pickle.dumps(vocab) vocab_unpickled = pickle.loads(vocab_pickled) assert vocab.to_bytes() == vocab_unpickled.to_bytes() + assert vocab_unpickled.vectors.mode == "floret" diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 72bbe04e5..00ae2c056 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,5 +1,6 @@ import pytest from click import NoSuchOption +from packaging.specifiers import SpecifierSet from spacy.training import docs_to_json, offsets_to_biluo_tags from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate @@ -491,19 +492,27 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] +@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release") def test_download_compatibility(): - model_name = "en_core_web_sm" - compatibility = get_compatibility() - version = get_version(model_name, compatibility) - assert get_minor_version(about.__version__) == get_minor_version(version) + spec = SpecifierSet("==" + about.__version__) + spec.prereleases = False + if about.__version__ in spec: + model_name = "en_core_web_sm" + compatibility = get_compatibility() + version = get_version(model_name, compatibility) + assert get_minor_version(about.__version__) == get_minor_version(version) +@pytest.mark.skip(reason="Temporarily skip until v3.2.0 release") def test_validate_compatibility_table(): - model_pkgs, compat = get_model_pkgs() - spacy_version = get_minor_version(about.__version__) - current_compat = compat.get(spacy_version, {}) - assert len(current_compat) > 0 - assert "en_core_web_sm" in current_compat + spec = SpecifierSet("==" + about.__version__) + spec.prereleases = False + if about.__version__ in spec: + model_pkgs, compat = get_model_pkgs() + spacy_version = get_minor_version(about.__version__) + current_compat = compat.get(spacy_version, {}) + assert len(current_compat) > 0 + assert "en_core_web_sm" in current_compat @pytest.mark.parametrize("component_name", ["ner", "textcat", "spancat", "tagger"]) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 8dbb6fd75..c5fdc8eb0 
100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -8,7 +8,7 @@ from spacy.vocab import Vocab from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German -from spacy.util import registry, ignore_error, raise_error +from spacy.util import registry, ignore_error, raise_error, find_matching_language import spacy from thinc.api import CupyOps, NumpyOps, get_current_ops @@ -255,6 +255,38 @@ def test_language_pipe_error_handler_custom(en_vocab, n_process): assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process): + """Test the error handling of nlp.pipe with input as tuples""" + Language.component("my_evil_component", func=evil_component) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("my_evil_component") + texts = [ + ("TEXT 111", 111), + ("TEXT 222", 222), + ("TEXT 333", 333), + ("TEXT 342", 342), + ("TEXT 666", 666), + ] + with pytest.raises(ValueError): + list(nlp.pipe(texts, as_tuples=True)) + nlp.set_error_handler(warn_error) + logger = logging.getLogger("spacy") + with mock.patch.object(logger, "warning") as mock_warning: + tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process)) + # HACK/TODO? the warnings in child processes don't seem to be + # detected by the mock logger + if n_process == 1: + mock_warning.assert_called() + assert mock_warning.call_count == 2 + assert len(tuples) + mock_warning.call_count == len(texts) + assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111) + assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333) + assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666) + + @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe_error_handler_pipe(en_vocab, n_process): """Test the error handling of a component's pipe method""" @@ -512,6 +544,55 @@ def test_spacy_blank(): assert nlp.meta["name"] == "my_custom_model" +@pytest.mark.parametrize( + "lang,target", + [ + ("en", "en"), + ("fra", "fr"), + ("fre", "fr"), + ("iw", "he"), + ("mo", "ro"), + ("mul", "xx"), + ("no", "nb"), + ("pt-BR", "pt"), + ("xx", "xx"), + ("zh-Hans", "zh"), + ("zh-Hant", None), + ("zxx", None), + ], +) +def test_language_matching(lang, target): + """ + Test that we can look up languages by equivalent or nearly-equivalent + language codes. + """ + assert find_matching_language(lang) == target + + +@pytest.mark.parametrize( + "lang,target", + [ + ("en", "en"), + ("fra", "fr"), + ("fre", "fr"), + ("iw", "he"), + ("mo", "ro"), + ("mul", "xx"), + ("no", "nb"), + ("pt-BR", "pt"), + ("xx", "xx"), + ("zh-Hans", "zh"), + ], +) +def test_blank_languages(lang, target): + """ + Test that we can get spacy.blank in various languages, including codes + that are defined to be equivalent or that match by CLDR language matching. 
+ """ + nlp = spacy.blank(lang) + assert nlp.lang == target + + @pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab]) def test_language_init_invalid_vocab(value): err_fragment = "invalid value" @@ -540,6 +621,32 @@ def test_language_source_and_vectors(nlp2): assert nlp.vocab.vectors.to_bytes() == vectors_bytes +@pytest.mark.parametrize("n_process", [1, 2]) +def test_pass_doc_to_pipeline(nlp, n_process): + texts = ["cats", "dogs", "guinea pigs"] + docs = [nlp.make_doc(text) for text in texts] + assert not any(len(doc.cats) for doc in docs) + doc = nlp(docs[0]) + assert doc.text == texts[0] + assert len(doc.cats) > 0 + if isinstance(get_current_ops(), NumpyOps) or n_process < 2: + docs = nlp.pipe(docs, n_process=n_process) + assert [doc.text for doc in docs] == texts + assert all(len(doc.cats) for doc in docs) + + +def test_invalid_arg_to_pipeline(nlp): + str_list = ["This is a text.", "This is another."] + with pytest.raises(ValueError): + nlp(str_list) # type: ignore + assert len(list(nlp.pipe(str_list))) == 2 + int_list = [1, 2, 3] + with pytest.raises(ValueError): + list(nlp.pipe(int_list)) # type: ignore + with pytest.raises(ValueError): + nlp(int_list) # type: ignore + + @pytest.mark.skipif( not isinstance(get_current_ops(), CupyOps), reason="test requires GPU" ) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 45cbdf45b..f17d5e62e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -139,6 +139,12 @@ def test_load_model_blank_shortcut(): nlp = util.load_model("blank:en") assert nlp.lang == "en" assert nlp.pipeline == [] + + # ImportError for loading an unsupported language + with pytest.raises(ImportError): + util.load_model("blank:zxx") + + # ImportError for requesting an invalid language code that isn't registered with pytest.raises(ImportError): util.load_model("blank:fjsfijsdof") diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 16cc97f6d..6e15fa2de 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -249,6 +249,7 @@ def test_tag_score(tagged_doc): assert results["tag_acc"] == 1.0 assert results["pos_acc"] == 1.0 assert results["morph_acc"] == 1.0 + assert results["morph_micro_f"] == 1.0 assert results["morph_per_feat"]["NounType"]["f"] == 1.0 # Gold annotation is modified @@ -272,6 +273,7 @@ def test_tag_score(tagged_doc): assert results["tag_acc"] == 0.9 assert results["pos_acc"] == 0.9 assert results["morph_acc"] == approx(0.8) + assert results["morph_micro_f"] == approx(0.8461538) assert results["morph_per_feat"]["NounType"]["f"] == 1.0 assert results["morph_per_feat"]["Poss"]["f"] == 0.0 assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 9a98e049e..85716377a 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length): if sys.maxunicode >= 1114111: tokens = tokenizer(text) assert len(tokens) == length + + +def test_tokenizer_degree(tokenizer): + for u in "cfkCFK": + assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."] + assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."] diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 7d0c16745..192faa67b 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -2,7 
+2,7 @@ import pytest import re from spacy.vocab import Vocab from spacy.tokenizer import Tokenizer -from spacy.util import ensure_path +from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex from spacy.lang.en import English @@ -212,3 +212,20 @@ def test_tokenizer_flush_specials(en_vocab): assert [t.text for t in tokenizer1("a a.")] == ["a a", "."] tokenizer1.rules = {} assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."] + + +def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab): + # the prefix and suffix matches overlap in the suffix lookbehind + prefixes = ['a(?=.)'] + suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.'] + prefix_re = compile_prefix_regex(prefixes) + suffix_re = compile_suffix_regex(suffixes) + tokenizer = Tokenizer( + en_vocab, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + ) + tokens = [t.text for t in tokenizer("a10.")] + assert tokens == ["a", "10", "."] + explain_tokens = [t[1] for t in tokenizer.explain("a10.")] + assert tokens == explain_tokens diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index cd428be15..48636a4eb 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -524,6 +524,30 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"] assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] +def test_docbin_user_data_serialized(doc): + doc.user_data["check"] = True + nlp = English() + + with make_tempdir() as tmpdir: + output_file = tmpdir / "userdata.spacy" + DocBin(docs=[doc], store_user_data=True).to_disk(output_file) + reloaded_docs = DocBin().from_disk(output_file).get_docs(nlp.vocab) + reloaded_doc = list(reloaded_docs)[0] + + assert reloaded_doc.user_data["check"] == True + +def test_docbin_user_data_not_serialized(doc): + # this isn't serializable, but that shouldn't cause an error + doc.user_data["check"] = set() + nlp = English() + + with make_tempdir() as tmpdir: + output_file = tmpdir / "userdata.spacy" + DocBin(docs=[doc], store_user_data=False).to_disk(output_file) + reloaded_docs = DocBin().from_disk(output_file).get_docs(nlp.vocab) + reloaded_doc = list(reloaded_docs)[0] + + assert "check" not in reloaded_doc.user_data @pytest.mark.parametrize( "tokens_a,tokens_b,expected", diff --git a/spacy/tests/universe/test_universe_json.py b/spacy/tests/universe/test_universe_json.py new file mode 100644 index 000000000..295889186 --- /dev/null +++ b/spacy/tests/universe/test_universe_json.py @@ -0,0 +1,17 @@ +import json +import re +from pathlib import Path + + +def test_universe_json(): + + root_dir = Path(__file__).parent + universe_file = root_dir / "universe.json" + + with universe_file.open() as f: + universe_data = json.load(f) + for entry in universe_data["resources"]: + if "github" in entry: + assert not re.match( + r"^(http:)|^(https:)", entry["github"] + ), "Github field should be user/repo, not a url" diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 23597455f..f2e74c3c9 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,12 +1,14 @@ import pytest import numpy -from numpy.testing import assert_allclose, assert_equal +from numpy.testing import assert_allclose, assert_equal, assert_almost_equal from thinc.api import get_current_ops +from spacy.lang.en import English from spacy.vocab import Vocab from spacy.vectors import Vectors 
from spacy.tokenizer import Tokenizer from spacy.strings import hash_string # type: ignore from spacy.tokens import Doc +from spacy.training.initialize import convert_vectors from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -29,22 +31,6 @@ def vectors(): ] -@pytest.fixture -def ngrams_vectors(): - return [ - ("apple", OPS.asarray([1, 2, 3])), - ("app", OPS.asarray([-0.1, -0.2, -0.3])), - ("ppl", OPS.asarray([-0.2, -0.3, -0.4])), - ("pl", OPS.asarray([0.7, 0.8, 0.9])), - ] - - -@pytest.fixture() -def ngrams_vocab(en_vocab, ngrams_vectors): - add_vecs_to_vocab(en_vocab, ngrams_vectors) - return en_vocab - - @pytest.fixture def data(): return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f") @@ -125,6 +111,7 @@ def test_init_vectors_with_data(strings, data): def test_init_vectors_with_shape(strings): v = Vectors(shape=(len(strings), 3)) assert v.shape == (len(strings), 3) + assert v.is_full is False def test_get_vector(strings, data): @@ -180,30 +167,6 @@ def test_vectors_token_vector(tokenizer_v, vectors, text): assert all([a == b for a, b in zip(vectors[1][1], doc[2].vector)]) -@pytest.mark.parametrize("text", ["apple"]) -def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text): - assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors[0][1]) - - -@pytest.mark.parametrize("text", ["applpie"]) -def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text): - truth = list(ngrams_vocab.get_vector(text, 1, 6)) - test = list( - [ - ( - ngrams_vectors[1][1][i] - + ngrams_vectors[2][1][i] - + ngrams_vectors[3][1][i] - ) - / 3 - for i in range(len(ngrams_vectors[1][1])) - ] - ) - eps = [abs(truth[i] - test[i]) for i in range(len(truth))] - for i in eps: - assert i < 1e-6 - - @pytest.mark.parametrize("text", ["apple", "orange"]) def test_vectors_lexeme_vector(vocab, text): lex = vocab[text] @@ -379,3 +342,178 @@ def test_vector_is_oov(): assert vocab["cat"].is_oov is False assert vocab["dog"].is_oov is False assert vocab["hamster"].is_oov is True + + +def test_init_vectors_unset(): + v = Vectors(shape=(10, 10)) + assert v.is_full is False + assert v.data.shape == (10, 10) + + with pytest.raises(ValueError): + v = Vectors(shape=(10, 10), mode="floret") + + v = Vectors(data=OPS.xp.zeros((10, 10)), mode="floret", hash_count=1) + assert v.is_full is True + + +def test_vectors_clear(): + data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + assert v.is_full is True + assert hash_string("A") in v + v.clear() + # no keys + assert v.key2row == {} + assert list(v) == [] + assert v.is_full is False + assert "A" not in v + with pytest.raises(KeyError): + v["A"] + + +def test_vectors_get_batch(): + data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f") + v = Vectors(data=data, keys=["A", "B", "C"]) + # check with mixed int/str keys + words = ["C", "B", "A", v.strings["B"]] + rows = v.find(keys=words) + vecs = OPS.as_contig(v.data[rows]) + assert_equal(OPS.to_numpy(vecs), OPS.to_numpy(v.get_batch(words))) + + +@pytest.fixture() +def floret_vectors_hashvec_str(): + """The full hashvec table from floret with the settings: + bucket 10, dim 10, minn 2, maxn 3, hash count 2, hash seed 2166136261, + bow <, eow >""" + return """10 10 2 3 2 2166136261 < > +0 -2.2611 3.9302 2.6676 -11.233 0.093715 -10.52 -9.6463 -0.11853 2.101 -0.10145 +1 -3.12 -1.7981 10.7 -6.171 4.4527 10.967 9.073 6.2056 -6.1199 -2.0402 +2 9.5689 5.6721 -8.4832 -1.2249 2.1871 -3.0264 -2.391 -5.3308 -3.2847 
-4.0382 +3 3.6268 4.2759 -1.7007 1.5002 5.5266 1.8716 -12.063 0.26314 2.7645 2.4929 +4 -11.683 -7.7068 2.1102 2.214 7.2202 0.69799 3.2173 -5.382 -2.0838 5.0314 +5 -4.3024 8.0241 2.0714 -1.0174 -0.28369 1.7622 7.8797 -1.7795 6.7541 5.6703 +6 8.3574 -5.225 8.6529 8.5605 -8.9465 3.767 -5.4636 -1.4635 -0.98947 -0.58025 +7 -10.01 3.3894 -4.4487 1.1669 -11.904 6.5158 4.3681 0.79913 -6.9131 -8.687 +8 -5.4576 7.1019 -8.8259 1.7189 4.955 -8.9157 -3.8905 -0.60086 -2.1233 5.892 +9 8.0678 -4.4142 3.6236 4.5889 -2.7611 2.4455 0.67096 -4.2822 2.0875 4.6274 +""" + + +@pytest.fixture() +def floret_vectors_vec_str(): + """The top 10 rows from floret with the settings above, to verify + that the spacy floret vectors are equivalent to the fasttext static + vectors.""" + return """10 10 +, -5.7814 2.6918 0.57029 -3.6985 -2.7079 1.4406 1.0084 1.7463 -3.8625 -3.0565 +. 3.8016 -1.759 0.59118 3.3044 -0.72975 0.45221 -2.1412 -3.8933 -2.1238 -0.47409 +der 0.08224 2.6601 -1.173 1.1549 -0.42821 -0.097268 -2.5589 -1.609 -0.16968 0.84687 +die -2.8781 0.082576 1.9286 -0.33279 0.79488 3.36 3.5609 -0.64328 -2.4152 0.17266 +und 2.1558 1.8606 -1.382 0.45424 -0.65889 1.2706 0.5929 -2.0592 -2.6949 -1.6015 +" -1.1242 1.4588 -1.6263 1.0382 -2.7609 -0.99794 -0.83478 -1.5711 -1.2137 1.0239 +in -0.87635 2.0958 4.0018 -2.2473 -1.2429 2.3474 1.8846 0.46521 -0.506 -0.26653 +von -0.10589 1.196 1.1143 -0.40907 -1.0848 -0.054756 -2.5016 -1.0381 -0.41598 0.36982 +( 0.59263 2.1856 0.67346 1.0769 1.0701 1.2151 1.718 -3.0441 2.7291 3.719 +) 0.13812 3.3267 1.657 0.34729 -3.5459 0.72372 0.63034 -1.6145 1.2733 0.37798 +""" + + +def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): + nlp = English() + nlp_plain = English() + # load both vec and hashvec tables + with make_tempdir() as tmpdir: + p = tmpdir / "test.hashvec" + with open(p, "w") as fileh: + fileh.write(floret_vectors_hashvec_str) + convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret") + p = tmpdir / "test.vec" + with open(p, "w") as fileh: + fileh.write(floret_vectors_vec_str) + convert_vectors(nlp_plain, p, truncate=0, prune=-1) + + word = "der" + # ngrams: full padded word + padded 2-grams + padded 3-grams + ngrams = nlp.vocab.vectors._get_ngrams(word) + assert ngrams == ["", "", ""] + # rows: 2 rows per ngram + rows = OPS.xp.asarray( + [ + h % nlp.vocab.vectors.data.shape[0] + for ngram in ngrams + for h in nlp.vocab.vectors._get_ngram_hashes(ngram) + ], + dtype="uint32", + ) + assert_equal( + OPS.to_numpy(rows), + numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]), + ) + assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count + # all vectors are equivalent for plain static table vs. 
hash ngrams + for word in nlp_plain.vocab.vectors: + word = nlp_plain.vocab.strings.as_string(word) + assert_almost_equal( + nlp.vocab[word].vector, nlp_plain.vocab[word].vector, decimal=3 + ) + + # every word has a vector + assert nlp.vocab[word * 5].has_vector + + # check that single and batched vector lookups are identical + words = [s for s in nlp_plain.vocab.vectors] + single_vecs = OPS.to_numpy(OPS.asarray([nlp.vocab[word].vector for word in words])) + batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words)) + assert_equal(single_vecs, batch_vecs) + + # an empty key returns 0s + assert_equal( + OPS.to_numpy(nlp.vocab[""].vector), + numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + ) + # an empty batch returns 0s + assert_equal( + OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), + numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), + ) + # an empty key within a batch returns 0s + assert_equal( + OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), + numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + ) + + # the loaded ngram vector table cannot be modified + # except for clear: warning, then return without modifications + vector = list(range(nlp.vocab.vectors.shape[1])) + orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab.set_vector("the", vector) + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab[word].vector = vector + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab.vectors.add("the", row=6) + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.warns(UserWarning): + nlp.vocab.vectors.resize(shape=(100, 10)) + assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) + with pytest.raises(ValueError): + nlp.vocab.vectors.clear() + + # data and settings are serialized correctly + with make_tempdir() as d: + nlp.vocab.to_disk(d) + vocab_r = Vocab() + vocab_r.from_disk(d) + assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes() + assert_equal( + OPS.to_numpy(nlp.vocab.vectors.data), OPS.to_numpy(vocab_r.vectors.data) + ) + assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg()) + assert_almost_equal( + OPS.to_numpy(nlp.vocab[word].vector), + OPS.to_numpy(vocab_r[word].vector), + decimal=6, + ) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 719e8e6f5..fa38a1015 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -23,10 +23,12 @@ cdef class Tokenizer: cdef object _infix_finditer cdef object _rules cdef PhraseMatcher _special_matcher - cdef int _property_init_count # TODO: unused, remove in v3.1 - cdef int _property_init_max # TODO: unused, remove in v3.1 + # TODO next two are unused and should be removed in v4 + # https://github.com/explosion/spaCy/pull/9150 + cdef int _unused_int1 + cdef int _unused_int2 - cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases) + cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 cdef void _filter_special_spans(self, vector[SpanC] &original, vector[SpanC] &filtered, int doc_len) nogil @@ -37,13 +39,13 @@ cdef class Tokenizer: cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1 - cdef int _tokenize(self, Doc tokens, unicode span, hash_t key, + cdef int _tokenize(self, Doc tokens, str span, hash_t key, int* 
has_special, bint with_special_cases) except -1 - cdef unicode _split_affixes(self, Pool mem, unicode string, + cdef str _split_affixes(self, Pool mem, str string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes, int* has_special, bint with_special_cases) - cdef int _attach_tokens(self, Doc tokens, unicode string, + cdef int _attach_tokens(self, Doc tokens, str string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes, int* has_special, bint with_special_cases) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 5a89e5a17..f8df13610 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,6 +1,4 @@ # cython: embedsignature=True, profile=True, binding=True -from __future__ import unicode_literals - from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from libc.string cimport memcpy, memset @@ -132,7 +130,7 @@ cdef class Tokenizer: self.url_match) return (self.__class__, args, None, None) - def __call__(self, unicode string): + def __call__(self, str string): """Tokenize a string. string (str): The string to tokenize. @@ -145,7 +143,7 @@ cdef class Tokenizer: return doc @cython.boundscheck(False) - cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): + cdef Doc _tokenize_affixes(self, str string, bint with_special_cases): """Tokenize according to affix and token_match settings. string (str): The string to tokenize. @@ -161,7 +159,7 @@ cdef class Tokenizer: cdef int start = 0 cdef int has_special = 0 cdef bint in_ws = string[0].isspace() - cdef unicode span + cdef str span # The task here is much like string.split, but not quite # We find spans of whitespace and non-space characters, and ignore # spans that are exactly ' '. So, our sequences will all be separated @@ -373,7 +371,7 @@ cdef class Tokenizer: return False return True - cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1: + cdef int _tokenize(self, Doc tokens, str span, hash_t orig_key, int* has_special, bint with_special_cases) except -1: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size @@ -385,16 +383,16 @@ cdef class Tokenizer: self._save_cached(&tokens.c[orig_size], orig_key, has_special, tokens.length - orig_size) - cdef unicode _split_affixes(self, Pool mem, unicode string, + cdef str _split_affixes(self, Pool mem, str string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, int* has_special, bint with_special_cases): cdef size_t i - cdef unicode prefix - cdef unicode suffix - cdef unicode minus_pre - cdef unicode minus_suf + cdef str prefix + cdef str suffix + cdef str minus_pre + cdef str minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: if self.token_match and self.token_match(string): @@ -410,7 +408,7 @@ cdef class Tokenizer: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) break - suf_len = self.find_suffix(string) + suf_len = self.find_suffix(string[pre_len:]) if suf_len != 0: suffix = string[-suf_len:] minus_suf = string[:-suf_len] @@ -430,7 +428,7 @@ cdef class Tokenizer: suffixes.push_back(self.vocab.get(mem, suffix)) return string - cdef int _attach_tokens(self, Doc tokens, unicode string, + cdef int _attach_tokens(self, Doc tokens, str string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, int* has_special, @@ -440,7 +438,7 @@ cdef class Tokenizer: cdef int split, end cdef const LexemeC* const* lexemes cdef const LexemeC* 
lexeme - cdef unicode span + cdef str span cdef int i if prefixes.size(): for i in range(prefixes.size()): @@ -513,7 +511,7 @@ cdef class Tokenizer: cached.data.lexemes = lexemes self._cache.set(key, cached) - def find_infix(self, unicode string): + def find_infix(self, str string): """Find internal split points of the string, such as hyphens. string (str): The string to segment. @@ -527,7 +525,7 @@ cdef class Tokenizer: return 0 return list(self.infix_finditer(string)) - def find_prefix(self, unicode string): + def find_prefix(self, str string): """Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. @@ -541,7 +539,7 @@ cdef class Tokenizer: match = self.prefix_search(string) return (match.end() - match.start()) if match is not None else 0 - def find_suffix(self, unicode string): + def find_suffix(self, str string): """Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. @@ -579,7 +577,7 @@ cdef class Tokenizer: if attr not in (ORTH, NORM): raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk)) - def add_special_case(self, unicode string, substrings): + def add_special_case(self, str string, substrings): """Add a special-case tokenization rule. string (str): The string to specially tokenize. diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 510a2ea71..bd2bdb811 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -37,7 +37,7 @@ class DocBin: "spans": List[Dict[str, bytes]], # SpanGroups data for each doc "spaces": bytes, # Serialized numpy boolean array with spaces data "lengths": bytes, # Serialized numpy int32 array with the doc lengths - "strings": List[unicode] # List of unique strings in the token data + "strings": List[str] # List of unique strings in the token data "version": str, # DocBin version number } @@ -117,7 +117,8 @@ class DocBin: self.strings.add(token.ent_kb_id_) self.strings.add(token.ent_id_) self.cats.append(doc.cats) - self.user_data.append(srsly.msgpack_dumps(doc.user_data)) + if self.store_user_data: + self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.span_groups.append(doc.spans.to_bytes()) for key, group in doc.spans.items(): for span in group: diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index c74ee0b63..57d087958 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -56,7 +56,7 @@ cdef class Doc: cdef public bint has_unknown_spaces - cdef public list _py_tokens + cdef public object _context cdef int length cdef int max_length diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 2b18cee7a..46a10df03 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -29,6 +29,7 @@ class Doc: tensor: numpy.ndarray user_data: Dict[str, Any] has_unknown_spaces: bool + _context: Any @classmethod def set_extension( cls, @@ -138,8 +139,8 @@ class Doc: def count_by( self, attr_id: int, exclude: Optional[Any] = ..., counts: Optional[Any] = ... ) -> Dict[Any, int]: ... - def from_array(self, attrs: List[int], array: Ints2d) -> Doc: ... - def to_array(self, py_attr_ids: List[int]) -> numpy.ndarray: ... + def from_array(self, attrs: Union[int, str, List[Union[int, str]]], array: Ints2d) -> Doc: ... + def to_array(self, py_attr_ids: Union[int, str, List[Union[int, str]]]) -> numpy.ndarray: ... 
@staticmethod def from_docs( docs: List[Doc], diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1ee845934..362a17784 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -262,7 +262,7 @@ cdef class Doc: raise ValueError(Errors.E027) cdef const LexemeC* lexeme for word, has_space in zip(words, spaces): - if isinstance(word, unicode): + if isinstance(word, str): lexeme = self.vocab.get(self.mem, word) elif isinstance(word, bytes): raise ValueError(Errors.E028.format(value=word)) @@ -538,7 +538,13 @@ cdef class Doc: kb_id = self.vocab.strings.add(kb_id) alignment_modes = ("strict", "contract", "expand") if alignment_mode not in alignment_modes: - raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes))) + raise ValueError( + Errors.E202.format( + name="alignment", + mode=alignment_mode, + modes=", ".join(alignment_modes), + ) + ) cdef int start = token_by_char(self.c, self.length, start_idx) if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx): return None @@ -1371,7 +1377,7 @@ cdef class Doc: self.has_unknown_spaces = msg["has_unknown_spaces"] start = 0 cdef const LexemeC* lex - cdef unicode orth_ + cdef str orth_ text = msg["text"] attrs = msg["array_body"] for i in range(attrs.shape[0]): @@ -1432,7 +1438,7 @@ cdef class Doc: attributes are inherited from the syntactic root of the span. RETURNS (Token): The first newly merged token. """ - cdef unicode tag, lemma, ent_type + cdef str tag, lemma, ent_type attr_len = len(attributes) span_len = len(spans) if not attr_len == span_len: @@ -1704,17 +1710,18 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): def pickle_doc(doc): bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, - doc.user_token_hooks) + doc.user_token_hooks, doc._context) return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data)) def unpickle_doc(vocab, hooks_and_data, bytes_data): - user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) + user_data, doc_hooks, span_hooks, token_hooks, _context = srsly.pickle_loads(hooks_and_data) doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"]) doc.user_hooks.update(doc_hooks) doc.user_span_hooks.update(span_hooks) doc.user_token_hooks.update(token_hooks) + doc._context = _context return doc diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index c9c807d7d..96f843a33 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - cimport numpy as np from libc.math cimport sqrt @@ -754,7 +752,7 @@ cdef class Span: def __get__(self): return self.root.ent_id_ - def __set__(self, unicode key): + def __set__(self, str key): raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property @@ -775,7 +773,7 @@ cdef class Span: def __get__(self): return self.doc.vocab.strings[self.label] - def __set__(self, unicode label_): + def __set__(self, str label_): self.label = self.doc.vocab.strings.add(label_) property kb_id_: @@ -783,7 +781,7 @@ cdef class Span: def __get__(self): return self.doc.vocab.strings[self.kb_id] - def __set__(self, unicode kb_id_): + def __set__(self, str kb_id_): self.kb_id = self.doc.vocab.strings.add(kb_id_) diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx index eb9221584..6cfa75237 100644 --- a/spacy/tokens/span_group.pyx +++ b/spacy/tokens/span_group.pyx @@ -63,7 
+63,7 @@ cdef class SpanGroup: doc = self._doc_ref() if doc is None: # referent has been garbage collected - raise RuntimeError(Errors.E866) + raise RuntimeError(Errors.E865) return doc @property diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index c5baae510..aa97e2b07 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -267,7 +267,7 @@ cdef class Token: """RETURNS (str): The text content of the span (with trailing whitespace). """ - cdef unicode orth = self.vocab.strings[self.c.lex.orth] + cdef str orth = self.vocab.strings[self.c.lex.orth] if self.c.spacy: return orth + " " else: @@ -820,7 +820,7 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.norm] - def __set__(self, unicode norm_): + def __set__(self, str norm_): self.c.norm = self.vocab.strings.add(norm_) @property @@ -858,7 +858,7 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.lemma] - def __set__(self, unicode lemma_): + def __set__(self, str lemma_): self.c.lemma = self.vocab.strings.add(lemma_) property pos_: @@ -892,7 +892,7 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.dep] - def __set__(self, unicode label): + def __set__(self, str label): self.c.dep = self.vocab.strings.add(label) @property diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 99fe7c19f..a4feb01f4 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40 from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 -from .loggers import console_logger, wandb_logger_v3 as wandb_logger # noqa: F401 +from .loggers import console_logger # noqa: F401 from .callbacks import create_copy_from_base_model # noqa: F401 diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 96abcc7cd..13ccfeb93 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -13,7 +13,7 @@ import warnings from .pretrain import get_tok2vec_ref from ..lookups import Lookups -from ..vectors import Vectors +from ..vectors import Vectors, Mode as VectorsMode from ..errors import Errors, Warnings from ..schemas import ConfigSchemaTraining from ..util import registry, load_model_from_config, resolve_dot_names, logger @@ -160,7 +160,13 @@ def load_vectors_into_model( err = ConfigValidationError.from_error(e, title=title, desc=desc) raise err from None - if len(vectors_nlp.vocab.vectors.keys()) == 0: + if ( + len(vectors_nlp.vocab.vectors.keys()) == 0 + and vectors_nlp.vocab.vectors.mode != VectorsMode.floret + ) or ( + vectors_nlp.vocab.vectors.data.shape[0] == 0 + and vectors_nlp.vocab.vectors.mode == VectorsMode.floret + ): logger.warning(Warnings.W112.format(name=name)) for lex in nlp.vocab: @@ -197,41 +203,80 @@ def convert_vectors( truncate: int, prune: int, name: Optional[str] = None, + mode: str = VectorsMode.default, ) -> None: vectors_loc = ensure_path(vectors_loc) if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): - nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) + nlp.vocab.vectors = Vectors( + strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb")) + ) for lex in nlp.vocab: if lex.rank and lex.rank != OOV_RANK: nlp.vocab.vectors.add(lex.orth, row=lex.rank) # type: ignore[attr-defined] else: if vectors_loc: 
logger.info(f"Reading vectors from {vectors_loc}") - vectors_data, vector_keys = read_vectors(vectors_loc, truncate) + vectors_data, vector_keys, floret_settings = read_vectors( + vectors_loc, + truncate, + mode=mode, + ) logger.info(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) - if vector_keys is not None: + if vector_keys is not None and mode != VectorsMode.floret: for word in vector_keys: if word not in nlp.vocab: nlp.vocab[word] if vectors_data is not None: - nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) + if mode == VectorsMode.floret: + nlp.vocab.vectors = Vectors( + strings=nlp.vocab.strings, + data=vectors_data, + **floret_settings, + ) + else: + nlp.vocab.vectors = Vectors( + strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys + ) if name is None: # TODO: Is this correct? Does this matter? nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" else: nlp.vocab.vectors.name = name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name - if prune >= 1: + if prune >= 1 and mode != VectorsMode.floret: nlp.vocab.prune_vectors(prune) -def read_vectors(vectors_loc: Path, truncate_vectors: int): +def read_vectors( + vectors_loc: Path, truncate_vectors: int, *, mode: str = VectorsMode.default +): f = ensure_shape(vectors_loc) - shape = tuple(int(size) for size in next(f).split()) - if truncate_vectors >= 1: - shape = (truncate_vectors, shape[1]) + header_parts = next(f).split() + shape = tuple(int(size) for size in header_parts[:2]) + floret_settings = {} + if mode == VectorsMode.floret: + if len(header_parts) != 8: + raise ValueError( + "Invalid header for floret vectors. " + "Expected: bucket dim minn maxn hash_count hash_seed BOW EOW" + ) + floret_settings = { + "mode": "floret", + "minn": int(header_parts[2]), + "maxn": int(header_parts[3]), + "hash_count": int(header_parts[4]), + "hash_seed": int(header_parts[5]), + "bow": header_parts[6], + "eow": header_parts[7], + } + if truncate_vectors >= 1: + raise ValueError(Errors.E860) + else: + assert len(header_parts) == 2 + if truncate_vectors >= 1: + shape = (truncate_vectors, shape[1]) vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_keys = [] for i, line in enumerate(tqdm.tqdm(f)): @@ -244,7 +289,7 @@ def read_vectors(vectors_loc: Path, truncate_vectors: int): vectors_keys.append(word) if i == truncate_vectors - 1: break - return vectors_data, vectors_keys + return vectors_data, vectors_keys, floret_settings def open_file(loc: Union[str, Path]) -> IO: @@ -271,7 +316,7 @@ def ensure_shape(vectors_loc): lines = open_file(vectors_loc) first_line = next(lines) try: - shape = tuple(int(size) for size in first_line.split()) + shape = tuple(int(size) for size in first_line.split()[:2]) except ValueError: shape = None if shape is not None: diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 602e0ff3e..edd0f1959 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -4,7 +4,6 @@ import tqdm import sys from ..util import registry -from .. 
import util from ..errors import Errors if TYPE_CHECKING: @@ -99,167 +98,3 @@ def console_logger(progress_bar: bool = False): return log_step, finalize return setup_printer - - -@registry.loggers("spacy.WandbLogger.v2") -def wandb_logger_v2( - project_name: str, - remove_config_values: List[str] = [], - model_log_interval: Optional[int] = None, - log_dataset_dir: Optional[str] = None, -): - try: - import wandb - - # test that these are available - from wandb import init, log, join # noqa: F401 - except ImportError: - raise ImportError(Errors.E880) - - console = console_logger(progress_bar=False) - - def setup_logger( - nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: - config = nlp.config.interpolate() - config_dot = util.dict_to_dot(config) - for field in remove_config_values: - del config_dot[field] - config = util.dot_to_dict(config_dot) - run = wandb.init(project=project_name, config=config, reinit=True) - console_log_step, console_finalize = console(nlp, stdout, stderr) - - def log_dir_artifact( - path: str, - name: str, - type: str, - metadata: Optional[Dict[str, Any]] = {}, - aliases: Optional[List[str]] = [], - ): - dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata) - dataset_artifact.add_dir(path, name=name) - wandb.log_artifact(dataset_artifact, aliases=aliases) - - if log_dataset_dir: - log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") - - def log_step(info: Optional[Dict[str, Any]]): - console_log_step(info) - if info is not None: - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) - if model_log_interval and info.get("output_path"): - if info["step"] % model_log_interval == 0 and info["step"] != 0: - log_dir_artifact( - path=info["output_path"], - name="pipeline_" + run.id, - type="checkpoint", - metadata=info, - aliases=[ - f"epoch {info['epoch']} step {info['step']}", - "latest", - "best" - if info["score"] == max(info["checkpoints"])[0] - else "", - ], - ) - - def finalize() -> None: - console_finalize() - wandb.join() - - return log_step, finalize - - return setup_logger - - -@registry.loggers("spacy.WandbLogger.v3") -def wandb_logger_v3( - project_name: str, - remove_config_values: List[str] = [], - model_log_interval: Optional[int] = None, - log_dataset_dir: Optional[str] = None, - entity: Optional[str] = None, - run_name: Optional[str] = None, -): - try: - import wandb - - # test that these are available - from wandb import init, log, join # noqa: F401 - except ImportError: - raise ImportError(Errors.E880) - - console = console_logger(progress_bar=False) - - def setup_logger( - nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: - config = nlp.config.interpolate() - config_dot = util.dict_to_dot(config) - for field in remove_config_values: - del config_dot[field] - config = util.dot_to_dict(config_dot) - run = wandb.init( - project=project_name, config=config, entity=entity, reinit=True - ) - - if run_name: - wandb.run.name = run_name - - console_log_step, console_finalize = console(nlp, stdout, stderr) - - def log_dir_artifact( - path: str, - name: str, - type: str, - metadata: Optional[Dict[str, Any]] = {}, - aliases: Optional[List[str]] = [], - ): - dataset_artifact = 
wandb.Artifact(name, type=type, metadata=metadata) - dataset_artifact.add_dir(path, name=name) - wandb.log_artifact(dataset_artifact, aliases=aliases) - - if log_dataset_dir: - log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") - - def log_step(info: Optional[Dict[str, Any]]): - console_log_step(info) - if info is not None: - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) - if model_log_interval and info.get("output_path"): - if info["step"] % model_log_interval == 0 and info["step"] != 0: - log_dir_artifact( - path=info["output_path"], - name="pipeline_" + run.id, - type="checkpoint", - metadata=info, - aliases=[ - f"epoch {info['epoch']} step {info['step']}", - "latest", - "best" - if info["score"] == max(info["checkpoints"])[0] - else "", - ], - ) - - def finalize() -> None: - console_finalize() - wandb.join() - - return log_step, finalize - - return setup_logger diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 2328ebbc7..7830196bc 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -49,7 +49,10 @@ def pretrain( objective = model.attrs["loss"] # TODO: move this to logger function? tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") + if P["n_save_epoch"]: + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume} - saving every {P['n_save_epoch']} epoch") + else: + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -78,7 +81,12 @@ def pretrain( msg.row(progress, **row_settings) if P["n_save_every"] and (batch_id % P["n_save_every"] == 0): _save_model(epoch, is_temp=True) - _save_model(epoch) + + if P["n_save_epoch"]: + if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1: + _save_model(epoch) + else: + _save_model(epoch) tracker.epoch_loss = 0.0 diff --git a/spacy/util.py b/spacy/util.py index cf62a4ecd..e14f6030f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,6 +17,7 @@ import numpy import srsly import catalogue from catalogue import RegistryError, Registry +import langcodes import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier @@ -29,6 +30,7 @@ import tempfile import shutil import shlex import inspect +import pkgutil import logging try: @@ -94,6 +96,7 @@ class registry(thinc.registry): readers = catalogue.create("spacy", "readers", entry_points=True) augmenters = catalogue.create("spacy", "augmenters", entry_points=True) loggers = catalogue.create("spacy", "loggers", entry_points=True) + scorers = catalogue.create("spacy", "scorers", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily # load them via the entry points. The "true" factories are added via the @@ -257,20 +260,89 @@ def lang_class_is_loaded(lang: str) -> bool: return lang in registry.languages +def find_matching_language(lang: str) -> Optional[str]: + """ + Given an IETF language code, find a supported spaCy language that is a + close match for it (according to Unicode CLDR language-matching rules). 
+ This allows for language aliases, ISO 639-2 codes, more detailed language + tags, and close matches. + + Returns the language code if a matching language is available, or None + if there is no matching language. + + >>> find_matching_language('en') + 'en' + >>> find_matching_language('pt-BR') # Brazilian Portuguese + 'pt' + >>> find_matching_language('fra') # an ISO 639-2 code for French + 'fr' + >>> find_matching_language('iw') # obsolete alias for Hebrew + 'he' + >>> find_matching_language('no') # Norwegian + 'nb' + >>> find_matching_language('mo') # old code for ro-MD + 'ro' + >>> find_matching_language('zh-Hans') # Simplified Chinese + 'zh' + >>> find_matching_language('zxx') + None + """ + import spacy.lang # noqa: F401 + if lang == 'xx': + return 'xx' + + # Find out which language modules we have + possible_languages = [] + for modinfo in pkgutil.iter_modules(spacy.lang.__path__): # type: ignore + code = modinfo.name + if code == 'xx': + # Temporarily make 'xx' into a valid language code + possible_languages.append('mul') + elif langcodes.tag_is_valid(code): + possible_languages.append(code) + + # Distances from 1-9 allow near misses like Bosnian -> Croatian and + # Norwegian -> Norwegian Bokmål. A distance of 10 would include several + # more possibilities, like variants of Chinese like 'wuu', but text that + # is labeled that way is probably trying to be distinct from 'zh' and + # shouldn't automatically match. + match = langcodes.closest_supported_match( + lang, possible_languages, max_distance=9 + ) + if match == 'mul': + # Convert 'mul' back to spaCy's 'xx' + return 'xx' + else: + return match + + def get_lang_class(lang: str) -> Type["Language"]: """Import and load a Language class. - lang (str): Two-letter language code, e.g. 'en'. + lang (str): IETF language code, such as 'en'. RETURNS (Language): Language class. """ # Check if language is registered / entry point is available if lang in registry.languages: return registry.languages.get(lang) else: + # Find the language in the spacy.lang subpackage try: module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: - raise ImportError(Errors.E048.format(lang=lang, err=err)) from err + # Find a matching language. For example, if the language 'no' is + # requested, we can use language-matching to load `spacy.lang.nb`. + try: + match = find_matching_language(lang) + except langcodes.tag_parser.LanguageTagError: + # proceed to raising an import error + match = None + + if match: + lang = match + module = importlib.import_module(f".lang.{lang}", "spacy") + else: + raise ImportError(Errors.E048.format(lang=lang, err=err)) from err set_lang_class(lang, getattr(module, module.__all__[0])) # type: ignore[attr-defined] return registry.languages.get(lang) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 7cb3322c2..6d6783af4 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,16 +1,23 @@ cimport numpy as np +from libc.stdint cimport uint32_t from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset +from murmurhash.mrmr cimport hash128_x64 import functools import numpy +from typing import cast +import warnings +from enum import Enum import srsly from thinc.api import get_array_module, get_current_ops +from thinc.backends import get_array_ops +from thinc.types import Floats2d from .strings cimport StringStore from .strings import get_string_id -from .errors import Errors +from .errors import Errors, Warnings from . 
import util @@ -18,18 +25,13 @@ def unpickle_vectors(bytes_data): return Vectors().from_bytes(bytes_data) -class GlobalRegistry: - """Global store of vectors, to avoid repeatedly loading the data.""" - data = {} +class Mode(str, Enum): + default = "default" + floret = "floret" @classmethod - def register(cls, name, data): - cls.data[name] = data - return functools.partial(cls.get, name) - - @classmethod - def get(cls, name): - return cls.data[name] + def values(cls): + return list(cls.__members__.keys()) cdef class Vectors: @@ -37,45 +39,93 @@ cdef class Vectors: Vectors data is kept in the vectors.data attribute, which should be an instance of numpy.ndarray (for CPU vectors) or cupy.ndarray - (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to - rows in the vectors.data table. + (for GPU vectors). - Multiple keys can be mapped to the same vector, and not all of the rows in - the table need to be assigned - so len(list(vectors.keys())) may be - greater or smaller than vectors.shape[0]. + In the default mode, `vectors.key2row` is a dictionary mapping word hashes + to rows in the vectors.data table. Multiple keys can be mapped to the same + vector, and not all of the rows in the table need to be assigned - so + len(list(vectors.keys())) may be greater or smaller than vectors.shape[0]. + + In floret mode, the floret settings (minn, maxn, etc.) are used to + calculate the vector from the rows corresponding to the key's ngrams. DOCS: https://spacy.io/api/vectors """ + cdef public object strings cdef public object name + cdef readonly object mode cdef public object data cdef public object key2row cdef cppset[int] _unset + cdef readonly uint32_t minn + cdef readonly uint32_t maxn + cdef readonly uint32_t hash_count + cdef readonly uint32_t hash_seed + cdef readonly unicode bow + cdef readonly unicode eow - def __init__(self, *, shape=None, data=None, keys=None, name=None): + def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): """Create a new vector store. + strings (StringStore): The string store. shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray or cupy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. name (str): A name to identify the vectors table. + mode (str): Vectors mode: "default" or "floret" (default: "default"). + minn (int): The floret char ngram minn (default: 0). + maxn (int): The floret char ngram maxn (default: 0). + hash_count (int): The floret hash count (1-4, default: 1). + hash_seed (int): The floret hash seed (default: 0). + bow (str): The floret BOW string (default: "<"). + eow (str): The floret EOW string (default: ">"). 
DOCS: https://spacy.io/api/vectors#init """ + self.strings = strings + if self.strings is None: + self.strings = StringStore() self.name = name - if data is None: - if shape is None: - shape = (0,0) - ops = get_current_ops() - data = ops.xp.zeros(shape, dtype="f") - self.data = data + if mode not in Mode.values(): + raise ValueError( + Errors.E202.format( + name="vectors", + mode=mode, + modes=str(Mode.values()) + ) + ) + self.mode = Mode(mode).value self.key2row = {} - if self.data is not None: - self._unset = cppset[int]({i for i in range(self.data.shape[0])}) - else: + self.minn = minn + self.maxn = maxn + self.hash_count = hash_count + self.hash_seed = hash_seed + self.bow = bow + self.eow = eow + if self.mode == Mode.default: + if data is None: + if shape is None: + shape = (0,0) + ops = get_current_ops() + data = ops.xp.zeros(shape, dtype="f") + self._unset = cppset[int]({i for i in range(data.shape[0])}) + else: + self._unset = cppset[int]() + self.data = data + if keys is not None: + for i, key in enumerate(keys): + self.add(key, row=i) + elif self.mode == Mode.floret: + if maxn < minn: + raise ValueError(Errors.E863) + if hash_count < 1 or hash_count >= 5: + raise ValueError(Errors.E862) + if data is None: + raise ValueError(Errors.E864) + if keys is not None: + raise ValueError(Errors.E861) + self.data = data self._unset = cppset[int]() - if keys is not None: - for i, key in enumerate(keys): - self.add(key, row=i) @property def shape(self): @@ -106,6 +156,8 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#is_full """ + if self.mode == Mode.floret: + return True return self._unset.size() == 0 @property @@ -113,7 +165,8 @@ cdef class Vectors: """Get the number of keys in the table. Note that this is the number of all keys, not just unique vectors. - RETURNS (int): The number of keys in the table. + RETURNS (int): The number of keys in the table for default vectors. + For floret vectors, return -1. DOCS: https://spacy.io/api/vectors#n_keys """ @@ -125,25 +178,33 @@ cdef class Vectors: def __getitem__(self, key): """Get a vector by key. If the key is not found, a KeyError is raised. - key (int): The key to get the vector for. + key (str/int): The key to get the vector for. RETURNS (ndarray): The vector for the key. DOCS: https://spacy.io/api/vectors#getitem """ - i = self.key2row[key] - if i is None: - raise KeyError(Errors.E058.format(key=key)) - else: - return self.data[i] + if self.mode == Mode.default: + i = self.key2row.get(get_string_id(key), None) + if i is None: + raise KeyError(Errors.E058.format(key=key)) + else: + return self.data[i] + elif self.mode == Mode.floret: + return self.get_batch([key])[0] + raise KeyError(Errors.E058.format(key=key)) def __setitem__(self, key, vector): """Set a vector for the given key. - key (int): The key to set the vector for. + key (str/int): The key to set the vector for. vector (ndarray): The vector to set. DOCS: https://spacy.io/api/vectors#setitem """ + if self.mode == Mode.floret: + warnings.warn(Warnings.W115.format(method="Vectors.__setitem__")) + return + key = get_string_id(key) i = self.key2row[key] self.data[i] = vector if self._unset.count(i): @@ -175,7 +236,10 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#contains """ - return key in self.key2row + if self.mode == Mode.floret: + return True + else: + return key in self.key2row def resize(self, shape, inplace=False): """Resize the underlying vectors array. 
If inplace=True, the memory @@ -192,6 +256,9 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#resize """ + if self.mode == Mode.floret: + warnings.warn(Warnings.W115.format(method="Vectors.resize")) + return -1 xp = get_array_module(self.data) if inplace: if shape[1] != self.data.shape[1]: @@ -244,16 +311,23 @@ cdef class Vectors: def find(self, *, key=None, keys=None, row=None, rows=None): """Look up one or more keys by row, or vice versa. - key (str / int): Find the row that the given key points to. + key (Union[int, str]): Find the row that the given key points to. Returns int, -1 if missing. - keys (iterable): Find rows that the keys point to. + keys (Iterable[Union[int, str]]): Find rows that the keys point to. Returns ndarray. row (int): Find the first key that points to the row. Returns int. - rows (iterable): Find the keys that point to the rows. + rows (Iterable[int]): Find the keys that point to the rows. Returns ndarray. RETURNS: The requested key, keys, row or rows. """ + if self.mode == Mode.floret: + raise ValueError( + Errors.E858.format( + mode=self.mode, + alternative="Use Vectors[key] instead.", + ) + ) if sum(arg is None for arg in (key, keys, row, rows)) != 3: bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows} raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) @@ -273,6 +347,67 @@ cdef class Vectors: results = [row2key[row] for row in rows] return xp.asarray(results, dtype="uint64") + def _get_ngram_hashes(self, unicode s): + """Calculate up to 4 32-bit hash values with MurmurHash3_x64_128 using + the floret hash settings. + key (str): The string key. + RETURNS: A list of the integer hashes. + """ + cdef uint32_t[4] out + chars = s.encode("utf8") + cdef char* utf8_string = chars + hash128_x64(utf8_string, len(chars), self.hash_seed, &out) + rows = [out[i] for i in range(min(self.hash_count, 4))] + return rows + + def _get_ngrams(self, unicode key): + """Get all padded ngram strings using the ngram settings. + key (str): The string key. + RETURNS: A list of the ngram strings for the padded key. + """ + key = self.bow + key + self.eow + ngrams = [key] + [ + key[start:start+ngram_size] + for ngram_size in range(self.minn, self.maxn + 1) + for start in range(0, len(key) - ngram_size + 1) + ] + return ngrams + + def get_batch(self, keys): + """Get the vectors for the provided keys efficiently as a batch. + keys (Iterable[Union[int, str]]): The keys. + RETURNS: The requested vectors from the vector table. + """ + ops = get_array_ops(self.data) + if self.mode == Mode.default: + rows = self.find(keys=keys) + vecs = self.data[rows] + elif self.mode == Mode.floret: + keys = [self.strings.as_string(key) for key in keys] + if sum(len(key) for key in keys) == 0: + return ops.xp.zeros((len(keys), self.data.shape[1])) + unique_keys = tuple(set(keys)) + row_index = {key: i for i, key in enumerate(unique_keys)} + rows = [row_index[key] for key in keys] + indices = [] + lengths = [] + for key in unique_keys: + if key == "": + ngram_rows = [] + else: + ngram_rows = [ + h % self.data.shape[0] + for ngram in self._get_ngrams(key) + for h in self._get_ngram_hashes(ngram) + ] + indices.extend(ngram_rows) + lengths.append(len(ngram_rows)) + indices = ops.asarray(indices, dtype="int32") + lengths = ops.asarray(lengths, dtype="int32") + vecs = ops.reduce_mean(cast(Floats2d, self.data[indices]), lengths) + vecs = vecs[rows] + return ops.as_contig(vecs) + def add(self, key, *, vector=None, row=None): """Add a key to the table. 
Keys can be mapped to an existing vector by setting `row`, or a new vector can be added. @@ -284,6 +419,9 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#add """ + if self.mode == Mode.floret: + warnings.warn(Warnings.W115.format(method="Vectors.add")) + return -1 # use int for all keys and rows in key2row for more efficient access # and serialization key = int(get_string_id(key)) @@ -324,6 +462,11 @@ cdef class Vectors: RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)` tuple. """ + if self.mode == Mode.floret: + raise ValueError(Errors.E858.format( + mode=self.mode, + alternative="", + )) xp = get_array_module(self.data) filled = sorted(list({row for row in self.key2row.values()})) if len(filled) < n: @@ -368,7 +511,32 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) - def to_disk(self, path, **kwargs): + def _get_cfg(self): + if self.mode == Mode.default: + return { + "mode": Mode(self.mode).value, + } + elif self.mode == Mode.floret: + return { + "mode": Mode(self.mode).value, + "minn": self.minn, + "maxn": self.maxn, + "hash_count": self.hash_count, + "hash_seed": self.hash_seed, + "bow": self.bow, + "eow": self.eow, + } + + def _set_cfg(self, cfg): + self.mode = Mode(cfg.get("mode", Mode.default)).value + self.minn = cfg.get("minn", 0) + self.maxn = cfg.get("maxn", 0) + self.hash_count = cfg.get("hash_count", 0) + self.hash_seed = cfg.get("hash_seed", 0) + self.bow = cfg.get("bow", "<") + self.eow = cfg.get("eow", ">") + + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (str / Path): A path to a directory, which will be created if @@ -390,12 +558,14 @@ cdef class Vectors: save_array(self.data, _file) serializers = { + "strings": lambda p: self.strings.to_disk(p.with_suffix(".json")), "vectors": lambda p: save_vectors(p), - "key2row": lambda p: srsly.write_msgpack(p, self.key2row) + "key2row": lambda p: srsly.write_msgpack(p, self.key2row), + "vectors.cfg": lambda p: srsly.write_json(p, self._get_cfg()), } - return util.to_disk(path, serializers, []) + return util.to_disk(path, serializers, exclude) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -422,17 +592,23 @@ cdef class Vectors: if path.exists(): self.data = ops.xp.load(str(path)) + def load_settings(path): + if path.exists(): + self._set_cfg(srsly.read_json(path)) + serializers = { + "strings": lambda p: self.strings.from_disk(p.with_suffix(".json")), "vectors": load_vectors, "keys": load_keys, "key2row": load_key2row, + "vectors.cfg": load_settings, } - util.from_disk(path, serializers, []) + util.from_disk(path, serializers, exclude) self._sync_unset() return self - def to_bytes(self, **kwargs): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -447,12 +623,14 @@ cdef class Vectors: return srsly.msgpack_dumps(self.data) serializers = { + "strings": lambda: self.strings.to_bytes(), "key2row": lambda: srsly.msgpack_dumps(self.key2row), - "vectors": serialize_weights + "vectors": serialize_weights, + "vectors.cfg": lambda: srsly.json_dumps(self._get_cfg()), } - return util.to_bytes(serializers, []) + return util.to_bytes(serializers, exclude) - def from_bytes(self, data, **kwargs): + def from_bytes(self, data, *, exclude=tuple()): """Load state from a binary string. 
data (bytes): The data to load from. @@ -469,13 +647,25 @@ cdef class Vectors: self.data = xp.asarray(srsly.msgpack_loads(b)) deserializers = { + "strings": lambda b: self.strings.from_bytes(b), "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)), - "vectors": deserialize_weights + "vectors": deserialize_weights, + "vectors.cfg": lambda b: self._set_cfg(srsly.json_loads(b)) } - util.from_bytes(data, deserializers, []) + util.from_bytes(data, deserializers, exclude) self._sync_unset() return self + def clear(self): + """Clear all entries in the vector table. + + DOCS: https://spacy.io/api/vectors#clear + """ + if self.mode == Mode.floret: + raise ValueError(Errors.E859) + self.key2row = {} + self._sync_unset() + def _sync_unset(self): filled = {row for row in self.key2row.values()} self._unset = cppset[int]({row for row in range(self.data.shape[0]) if row not in filled}) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 9067476f7..9c951b2b7 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,21 +27,21 @@ cdef class Vocab: cdef Pool mem cdef readonly StringStore strings cdef public Morphology morphology - cdef public object vectors + cdef public object _vectors cdef public object _lookups cdef public object writing_system cdef public object get_noun_chunks cdef readonly int length - cdef public object data_dir + cdef public object _unused_object # TODO remove in v4, see #9150 cdef public object lex_attr_getters cdef public object cfg - cdef const LexemeC* get(self, Pool mem, unicode string) except NULL + cdef const LexemeC* get(self, Pool mem, str string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef const TokenC* make_fused_token(self, substrings) except NULL - cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 - cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef PreshMap _by_orth diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 603ef1ae7..713e85c01 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -71,7 +71,7 @@ def unpickle_vocab( sstore: StringStore, vectors: Any, morphology: Any, - data_dir: Any, + _unused_object: Any, lex_attr_getters: Any, lookups: Any, get_noun_chunks: Any, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b4dfd22f5..e2e7ad1db 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -14,7 +14,7 @@ from .attrs cimport LANG, ORTH from .compat import copy_reg from .errors import Errors from .attrs import intify_attrs, NORM, IS_STOP -from .vectors import Vectors +from .vectors import Vectors, Mode as VectorsMode from .util import registry from .lookups import Lookups from . import util @@ -60,7 +60,7 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. - vectors_name (unicode): Optional name to identify the vectors table. + vectors_name (str): Optional name to identify the vectors table. get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): A function that yields base noun phrases used for Doc.noun_chunks. 
""" @@ -77,11 +77,21 @@ cdef class Vocab: _ = self[string] self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) - self.vectors = Vectors(name=vectors_name) + self.vectors = Vectors(strings=self.strings, name=vectors_name) self.lookups = lookups self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks + property vectors: + def __get__(self): + return self._vectors + + def __set__(self, vectors): + for s in vectors.strings: + self.strings.add(s) + self._vectors = vectors + self._vectors.strings = self.strings + @property def lang(self): langfunc = None @@ -105,7 +115,7 @@ cdef class Vocab: See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag_getter (callable): A function `f(str) -> bool`, to get the flag value. flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest @@ -128,7 +138,7 @@ cdef class Vocab: self.lex_attr_getters[flag_id] = flag_getter return flag_id - cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: + cdef const LexemeC* get(self, Pool mem, str string) except NULL: """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` if necessary using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon. @@ -162,7 +172,7 @@ cdef class Vocab: else: return self._new_lexeme(mem, self.strings[orth]) - cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL: # I think this heuristic is bad, and the Vocab should always # own the lexemes. It avoids weird bugs this way, as it's how the thing # was originally supposed to work. The best solution to the growing @@ -184,7 +194,7 @@ cdef class Vocab: if self.lex_attr_getters is not None: for attr, func in self.lex_attr_getters.items(): value = func(string) - if isinstance(value, unicode): + if isinstance(value, str): value = self.strings.add(value) if value is not None: Lexeme.set_struct_attr(lex, attr, value) @@ -201,7 +211,7 @@ cdef class Vocab: def __contains__(self, key): """Check whether the string or int key has an entry in the vocabulary. - string (unicode): The ID string. + string (str): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. DOCS: https://spacy.io/api/vocab#contains @@ -209,7 +219,7 @@ cdef class Vocab: cdef hash_t int_key if isinstance(key, bytes): int_key = self.strings[key.decode("utf8")] - elif isinstance(key, unicode): + elif isinstance(key, str): int_key = self.strings[key] else: int_key = key @@ -234,7 +244,7 @@ cdef class Vocab: previously unseen unicode string is given, a new lexeme is created and stored. - id_or_string (int or unicode): The integer ID of a word, or its unicode + id_or_string (int or str): The integer ID of a word, or its unicode string. If `int >= Lexicon.size`, `IndexError` is raised. If `id_or_string` is neither an int nor a unicode string, `ValueError` is raised. 
@@ -247,7 +257,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#getitem """ cdef attr_t orth - if isinstance(id_or_string, unicode): + if isinstance(id_or_string, str): orth = self.strings.add(id_or_string) else: orth = id_or_string @@ -282,10 +292,10 @@ cdef class Vocab: if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: - self.vectors = Vectors(shape=shape) + self.vectors = Vectors(strings=self.strings, shape=shape) else: width = width if width is not None else self.vectors.data.shape[1] - self.vectors = Vectors(shape=(self.vectors.shape[0], width)) + self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): """Reduce the current vector table to `nr_row` unique entries. Words @@ -314,6 +324,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#prune_vectors """ + if self.vectors.mode != VectorsMode.default: + raise ValueError(Errors.E866) ops = get_current_ops() xp = get_array_module(self.vectors.data) # Make sure all vectors are in the vocab @@ -328,7 +340,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys = ops.to_numpy(syn_keys) remap = {} @@ -340,19 +352,12 @@ cdef class Vocab: remap[word] = (synonym, score) return remap - def get_vector(self, orth, minn=None, maxn=None): + def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. Words can be looked up by string or int ID. If no vectors data is loaded, ValueError is raised. - If `minn` is defined, then the resulting vector uses Fasttext's - subword features by average over ngrams of `orth`. - orth (int / unicode): The hash value of a word, or its unicode string. - minn (int): Minimum n-gram length used for Fasttext's ngram computation. - Defaults to the length of `orth`. - maxn (int): Maximum n-gram length used for Fasttext's ngram computation. - Defaults to the length of `orth`. RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. Usually, a numpy ndarray of shape (300,) and dtype float32. @@ -361,47 +366,17 @@ cdef class Vocab: """ if isinstance(orth, str): orth = self.strings.add(orth) - word = self[orth].orth_ - if orth in self.vectors.key2row: + if self.has_vector(orth): return self.vectors[orth] xp = get_array_module(self.vectors.data) vectors = xp.zeros((self.vectors_length,), dtype="f") - if minn is None: - return vectors - # Fasttext's ngram computation taken from - # https://github.com/facebookresearch/fastText - # Assign default ngram limit to maxn which is the length of the word. 
- if maxn is None: - maxn = len(word) - ngrams_size = 0; - for i in range(len(word)): - ngram = "" - if (word[i] and 0xC0) == 0x80: - continue - n = 1 - j = i - while (j < len(word) and n <= maxn): - if n > maxn: - break - ngram += word[j] - j = j + 1 - while (j < len(word) and (word[j] and 0xC0) == 0x80): - ngram += word[j] - j = j + 1 - if (n >= minn and not (n == 1 and (i == 0 or j == len(word)))): - if self.strings[ngram] in self.vectors.key2row: - vectors = xp.add(self.vectors[self.strings[ngram]], vectors) - ngrams_size += 1 - n = n + 1 - if ngrams_size > 0: - vectors = vectors * (1.0/ngrams_size) return vectors def set_vector(self, orth, vector): """Set a vector for a word in the vocabulary. Words can be referenced by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. vector (numpy.ndarray or cupy.nadarry[ndim=1, dtype='float32']): The vector to set. DOCS: https://spacy.io/api/vocab#set_vector @@ -417,13 +392,14 @@ cdef class Vocab: self.vectors.resize((new_rows, width)) lex = self[orth] # Add word to vocab if necessary row = self.vectors.add(orth, vector=vector) - lex.rank = row + if row >= 0: + lex.rank = row def has_vector(self, orth): """Check whether a word has a vector. Returns False if no vectors have been loaded. Words can be looked up by string or int ID. - orth (int / unicode): The word. + orth (int / str): The word. RETURNS (bool): Whether the word has a vector. DOCS: https://spacy.io/api/vocab#has_vector @@ -448,7 +424,7 @@ cdef class Vocab: def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. - path (unicode or Path): A path to a directory, which will be created if + path (str or Path): A path to a directory, which will be created if it doesn't exist. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -461,7 +437,7 @@ cdef class Vocab: if "strings" not in exclude: self.strings.to_disk(path / "strings.json") if "vectors" not in "exclude": - self.vectors.to_disk(path) + self.vectors.to_disk(path, exclude=["strings"]) if "lookups" not in "exclude": self.lookups.to_disk(path) @@ -469,7 +445,7 @@ cdef class Vocab: """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. + path (str or Path): A path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. RETURNS (Vocab): The modified `Vocab` object. 
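A minimal usage sketch for the floret mode added to `Vectors` above, assuming the constructor signature introduced in this patch; the table shape, ngram range and hash count are arbitrary illustration values, not defaults:

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

# Illustrative hash table: 1000 buckets x 32 dims filled with random values.
data = numpy.random.default_rng(0).standard_normal((1000, 32)).astype("float32")
vectors = Vectors(
    strings=StringStore(),
    data=data,
    mode="floret",
    minn=4,        # shortest character ngram
    maxn=5,        # longest character ngram
    hash_count=2,  # hashes per ngram (1-4)
)

# Any string gets a vector, averaged over the rows for its hashed ngrams.
vec = vectors["spacy"]
assert vec.shape == (32,)

# There is no key table in floret mode: membership is always True,
# add()/resize() only warn, and find()/most_similar() raise.
assert "anything" in vectors
```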
@@ -504,7 +480,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.to_bytes() + return self.vectors.to_bytes(exclude=["strings"]) getters = { "strings": lambda: self.strings.to_bytes(), @@ -526,7 +502,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.from_bytes(b) + return self.vectors.from_bytes(b, exclude=["strings"]) setters = { "strings": lambda b: self.strings.from_bytes(b), @@ -551,21 +527,21 @@ def pickle_vocab(vocab): sstore = vocab.strings vectors = vocab.vectors morph = vocab.morphology - data_dir = vocab.data_dir + _unused_object = vocab._unused_object lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lookups = vocab.lookups get_noun_chunks = vocab.get_noun_chunks return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, get_noun_chunks)) + (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks)) -def unpickle_vocab(sstore, vectors, morphology, data_dir, +def unpickle_vocab(sstore, vectors, morphology, _unused_object, lex_attr_getters, lookups, get_noun_chunks): cdef Vocab vocab = Vocab() vocab.vectors = vectors vocab.strings = sstore vocab.morphology = morphology - vocab.data_dir = data_dir + vocab._unused_object = _unused_object vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lookups = lookups vocab.get_noun_chunks = get_noun_chunks diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index a253ca9f8..965bffbcc 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -48,12 +48,13 @@ Initialize the attribute ruler. > ruler = nlp.add_pipe("attribute_ruler") > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | -| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | -| _keyword-only_ | | -| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | +| _keyword-only_ | | +| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag`", `"pos"`, `"morph"` and `"lemma"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} @@ -175,21 +176,6 @@ Load attribute ruler patterns from morph rules. 
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | -## AttributeRuler.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = ruler.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | - ## AttributeRuler.to_disk {#to_disk tag="method"} Serialize the pipe to disk. diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index a4462af56..b872181f9 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -203,11 +203,12 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | Name | Description | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | +| `lang` | Pipeline language [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as `en`. ~~str (positional)~~ | | `vectors_loc` | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | +| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | | `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. 
~~bool (flag)~~ | diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index c48172a22..118cdc611 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -105,6 +105,7 @@ shortcut for this and instantiate the component using its string name and | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_deps`](/api/scorer#score_deps) for the attribute `"dep"` ignoring the labels `p` and `punct` and [`Scorer.score_spans`](/api/scorer/#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -273,21 +274,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## DependencyParser.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = parser.score(examples) -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## DependencyParser.create_optimizer {#create_optimizer tag="method"} Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index bbc8f3942..3d3372679 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -51,15 +51,17 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. 
Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| Setting | Description | +| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -92,18 +94,20 @@ custom knowledge base, you should either call [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the [`initialize`](/api/entitylinker#initialize) call. -| Name | Description | -| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. 
~~bool~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -269,21 +273,6 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | -## EntityLinker.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = entity_linker.score(examples) -> ``` - -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links) . ~~Dict[str, float]~~ | - ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index ba7022c14..2f7a88fbf 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -66,6 +66,7 @@ architectures and their arguments and hyperparameters. | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | | `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/ner.pyx @@ -269,21 +270,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. 
~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## EntityRecognizer.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = ner.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index c9c3ec365..fb33642f8 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -61,6 +61,7 @@ how the component should be configured. You can override its settings via the | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entityruler.py diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 45c42040e..8d7686243 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -1044,7 +1044,7 @@ details. | Name | Description | | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | -| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | +| `lang` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en' for English. ~~str~~ | | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | ## Defaults {#defaults} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 8cb869f64..2fa040917 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -56,11 +56,13 @@ data format used by the lookup and rule-based lemmatizers, see > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | -| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | -| `model` | **Not yet implemented:** the model to use. 
~~Model~~ | +| Setting | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ | Many languages specify a default lemmatizer mode other than `lookup` if a better lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 00af83e6f..434c56833 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -42,9 +42,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("morphologizer", config=config) > ``` -| Setting | Description | -| ------- | ------------------------------------------------------------------------------------------------------- | -| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx @@ -56,6 +59,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). 
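For example, the new settings can be overridden through the factory config when the component is added (a minimal sketch; the chosen values only illustrate the behaviour described next):

```python
import spacy

nlp = spacy.blank("en")
# Keep any existing features and only add new ones (see the settings below).
nlp.add_pipe("morphologizer", config={"overwrite": False, "extend": True})
```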
+The `overwrite` and `extend` settings determine how existing annotation is +handled (with the example for existing annotation `A=B|C=D` + predicted +annotation `C=E|X=Y`): + +- `overwrite=True, extend=True`: overwrite values of existing features, add any + new features (`A=B|C=D` + `C=E|X=Y` → `A=B|C=E|X=Y`) +- `overwrite=True, extend=False`: overwrite completely, removing any existing + features (`A=B|C=D` + `C=E|X=Y` → `C=E|X=Y`) +- `overwrite=False, extend=True`: keep values of existing features, add any new + features (`A=B|C=D` + `C=E|X=Y` → `A=B|C=D|X=Y`) +- `overwrite=False, extend=False`: do not modify the existing annotation if set + (`A=B|C=D` + `C=E|X=Y` → `A=B|C=D`) + > #### Example > > ```python @@ -71,11 +87,15 @@ shortcut for this and instantiate the component using its string name and > morphologizer = Morphologizer(nlp.vocab, model) > ``` -| Name | Description | -| ------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 2f856c667..263942e3e 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -297,10 +297,12 @@ Score a batch of examples. > scores = pipe.score(examples) > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | +| `\*\*kwargs` | Any additional settings to pass on to the scorer. ~~Any~~ | +| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). 
~~Dict[str, Union[float, Dict[str, float]]]~~ | ## TrainablePipe.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index c8163091f..8dbe3b276 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -27,9 +27,13 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Description | -| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ | +| Name | Description | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ | +| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | +| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | +| _keyword-only_ | | +| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | ## Scorer.score {#score tag="method"} @@ -37,15 +41,20 @@ Calculate the scores for a list of [`Example`](/api/example) objects using the scoring methods provided by the components in the pipeline. The returned `Dict` contains the scores provided by the individual pipeline -components. For the scoring methods provided by the `Scorer` and use by the core -pipeline components, the individual score names start with the `Token` or `Doc` -attribute being scored: +components. For the scoring methods provided by the `Scorer` and used by the +core pipeline components, the individual score names start with the `Token` or +`Doc` attribute being scored: -- `token_acc`, `token_p`, `token_r`, `token_f`, +- `token_acc`, `token_p`, `token_r`, `token_f` - `sents_p`, `sents_r`, `sents_f` -- `tag_acc`, `pos_acc`, `morph_acc`, `morph_per_feat`, `lemma_acc` +- `tag_acc` +- `pos_acc` +- `morph_acc`, `morph_micro_p`, `morph_micro_r`, `morph_micro_f`, + `morph_per_feat` +- `lemma_acc` - `dep_uas`, `dep_las`, `dep_las_per_type` - `ents_p`, `ents_r` `ents_f`, `ents_per_type` +- `spans_sc_p`, `spans_sc_r`, `spans_sc_f` - `cats_score` (depends on config, description provided in `cats_score_desc`), `cats_micro_p`, `cats_micro_r`, `cats_micro_f`, `cats_macro_p`, `cats_macro_r`, `cats_macro_f`, `cats_macro_auc`, `cats_f_per_type`, @@ -80,7 +89,7 @@ Docs with `has_unknown_spaces` are skipped during scoring. 
> ``` | Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | @@ -120,14 +129,14 @@ scoring. > print(scores["morph_per_feat"]) > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | -| `attr` | The attribute to score. ~~str~~ | -| _keyword-only_ | | -| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | -| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | -| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | +| Name | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~ | +| **RETURNS** | A dictionary containing the micro PRF scores under the key `{attr}_micro_p/r/f` and the per-feature PRF scores under `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} @@ -253,3 +262,11 @@ entities that overlap between the gold reference and the predictions. | _keyword-only_ | | | `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ | | **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ | + +## get_ner_prf {#get_ner_prf new="3"} + +Compute micro-PRF and per-entity PRF scores. + +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. 
~~Iterable[Example]~~ | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 8d8e57319..29bf10393 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -39,9 +39,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("senter", config=config) > ``` -| Setting | Description | -| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/senter.pyx @@ -70,11 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} @@ -248,21 +253,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## SentenceRecognizer.score {#score tag="method" new="3"} - -Score a batch of examples. 
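As a brief sketch of the settings above (assuming an otherwise default `senter` factory), the new options can be passed via the factory config:

```python
import spacy

nlp = spacy.blank("en")
# `overwrite` is the v3.2 setting documented above; the default `scorer`
# falls back to Scorer.score_spans for the "sents" attribute
senter = nlp.add_pipe("senter", config={"overwrite": False})
```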
- -> #### Example -> -> ```python -> scores = senter.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ | - ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index ef2465c27..b75c7a2f1 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -37,9 +37,11 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("sentencizer", config=config) > ``` -| Setting | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | +| Setting | Description | +| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/sentencizer.pyx @@ -60,10 +62,12 @@ Initialize the sentencizer. > sentencizer = Sentencizer() > ``` -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | -| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ | ```python ### punct_chars defaults @@ -122,21 +126,6 @@ applied to the `Doc` in order. | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Sentencizer.score {#score tag="method" new="3"} - -Score a batch of examples. 
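For illustration, a short sketch combining the `punct_chars` option with the v3.2 `overwrite` setting (the example text is invented):

```python
import spacy

nlp = spacy.blank("en")
config = {"punct_chars": [".", "!", "?"], "overwrite": False}
nlp.add_pipe("sentencizer", config=config)
doc = nlp("This is a sentence! And this is another one.")
print([sent.text for sent in doc.sents])
```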
- -> #### Example -> -> ```python -> scores = sentencizer.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]~~ | - ## Sentencizer.to_disk {#to_disk tag="method"} Save the sentencizer settings (punctuation characters) to a directory. Will diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md index 4edc6fb5b..26fcaefdf 100644 --- a/website/docs/api/spancategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -59,6 +59,7 @@ architectures and their arguments and hyperparameters. | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/spancat.py @@ -257,22 +258,6 @@ predicted scores. | `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## SpanCategorizer.score {#score tag="method"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = spancat.score(examples) -> ``` - -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## SpanCategorizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index f34456b0c..93b6bc88b 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -40,9 +40,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). 
~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx @@ -69,11 +71,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -264,21 +269,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## Tagger.score {#score tag="method" new="3"} - -Score a batch of examples. 
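A minimal sketch of the tagger settings above (the label added below is just an example):

```python
import spacy

nlp = spacy.blank("en")
# v3.2 settings; the default scorer is Scorer.score_token_attr for "tag"
tagger = nlp.add_pipe("tagger", config={"overwrite": False})
tagger.add_label("NN")
```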
- -> #### Example -> -> ```python -> scores = tagger.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ | - ## Tagger.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 62a921d02..47f868637 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -112,13 +112,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index c78a1de03..4361db4c0 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -83,7 +83,7 @@ Create a blank pipeline of a given language class. This function is the twin of | Name | Description | | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ | +| `name` | [IETF language tag](https://www.w3.org/International/articles/language-tags/), such as 'en', of the language class to load. ~~str~~ | | _keyword-only_ | | | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. 
~~Union[Dict[str, Any], Config]~~ | @@ -373,6 +373,7 @@ factories. | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | +| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | ### spacy-transformers registry {#registry-transformers} @@ -410,10 +411,13 @@ finished. To log each training step, a [`spacy train`](/api/cli#train), including information such as the training loss and the accuracy scores on the development set. -There are two built-in logging functions: a logger printing results to the -console in tabular format (which is the default), and one that also sends the -results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of -using one of the built-in loggers listed here, you can also +The built-in, default logger is the ConsoleLogger, which prints results to the +console in tabular format. The +[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as +a dependency of spaCy, enables other loggers: currently it provides one that sends +results to a [Weights & Biases](https://www.wandb.com/) dashboard. + +Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). #### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} @@ -462,63 +466,6 @@ start decreasing across epochs. -#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"} - -> #### Installation -> -> ```bash -> $ pip install wandb -> $ wandb login -> ``` - -Built-in logger that sends the results of each training step to the dashboard of -the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights -& Biases should be installed, and you should be logged in. The logger will send -the full config file to W&B, as well as various system information such as -memory utilization, network traffic, disk IO, GPU statistics, etc. This will -also include information such as your hostname and operating system, as well as -the location of your Python executable. - - - -Note that by default, the full (interpolated) -[training config](/usage/training#config) is sent over to the W&B dashboard. If -you prefer to **exclude certain information** such as path names, you can list -those fields in "dot notation" in the `remove_config_values` parameter. These -fields will then be removed from the config before uploading, but will otherwise -remain in the config file stored on your local system. 
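As a rough sketch of how the `scorers` registry can be used (the registered name below is hypothetical, and the implementation simply delegates to the built-in `Scorer.score_spans`):

```python
from typing import Any, Dict, Iterable

import spacy
from spacy.scorer import Scorer
from spacy.training import Example

@spacy.registry.scorers("my_sents_scorer.v1")
def make_sents_scorer():
    def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        # Delegate to the built-in span scoring for sentence boundaries
        return Scorer.score_spans(examples, attr="sents", **kwargs)

    return score
```

A registered function like this could then be referenced from a component's `scorer` setting, e.g. `{"@scorers": "my_sents_scorer.v1"}`.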
- - - -> #### Example config -> -> ```ini -> [training.logger] -> @loggers = "spacy.WandbLogger.v3" -> project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] -> log_dataset_dir = "corpus" -> model_log_interval = 1000 -> ``` - -| Name | Description | -| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | -| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | -| `model_log_interval` | Steps to wait between logging model checkpoints to W&B dasboard (default: None). ~~Optional[int]~~ | -| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ | -| `run_name` | The name of the run. If you don't specify a run_name, the name will be created by wandb library. (default: None ). ~~Optional[str]~~ | -| `entity` | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ | - - - -Get started with tracking your spaCy training runs in Weights & Biases using our -project template. It trains on the IMDB Movie Review Dataset and includes a -simple config with the built-in `WandbLogger`, as well as a custom example of -creating variants of the config for a simple hyperparameter grid search and -logging the results. - - ## Readers {#readers} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 1a7f7a3f5..84d2c00ad 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -8,15 +8,30 @@ new: 2 Vectors data is kept in the `Vectors.data` attribute, which should be an instance of `numpy.ndarray` (for CPU vectors) or `cupy.ndarray` (for GPU -vectors). Multiple keys can be mapped to the same vector, and not all of the -rows in the table need to be assigned – so `vectors.n_keys` may be greater or -smaller than `vectors.shape[0]`. +vectors). + +As of spaCy v3.2, `Vectors` supports two types of vector tables: + +- `default`: A standard vector table (as in spaCy v3.1 and earlier) where each + key is mapped to one row in the vector table. Multiple keys can be mapped to + the same vector, and not all of the rows in the table need to be assigned – so + `vectors.n_keys` may be greater or smaller than `vectors.shape[0]`. +- `floret`: Only supports vectors trained with + [floret](https://github.com/explosion/floret), an extended version of + [fastText](https://fasttext.cc) that produces compact vector tables by + combining fastText's subword ngrams with Bloom embeddings. The compact tables + are similar to the [`HashEmbed`](https://thinc.ai/docs/api-layers#hashembed) + embeddings already used in many spaCy components. Each word is represented as + the sum of one or more rows as determined by the settings related to character + ngrams and the hash table. ## Vectors.\_\_init\_\_ {#init tag="method"} -Create a new vector store. 
You can set the vector values and keys directly on -initialization, or supply a `shape` keyword argument to create an empty table -you can add vectors to later. +Create a new vector store. With the default mode, you can set the vector values +and keys directly on initialization, or supply a `shape` keyword argument to +create an empty table you can add vectors to later. In floret mode, the complete +vector data and settings must be provided on initialization and cannot be +modified later. > #### Example > @@ -30,13 +45,21 @@ you can add vectors to later. > vectors = Vectors(data=data, keys=keys) > ``` -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | -| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | -| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | -| `name` | A name to identify the vectors table. ~~str~~ | +| Name | Description | +| ----------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ | +| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | +| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | +| `name` | A name to identify the vectors table. ~~str~~ | +| `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | +| `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ | +| `maxn` 3.2 | The floret char ngram maxn (default: `0`). ~~int~~ | +| `hash_count` 3.2 | The floret hash count. Supported values: 1--4 (default: `1`). ~~int~~ | +| `hash_seed` 3.2 | The floret hash seed (default: `0`). ~~int~~ | +| `bow` 3.2 | The floret BOW string (default: `"<"`). ~~str~~ | +| `eow` 3.2 | The floret EOW string (default: `">"`). ~~str~~ | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} @@ -53,12 +76,12 @@ raised. | Name | Description | | ----------- | ---------------------------------------------------------------- | -| `key` | The key to get the vector for. ~~int~~ | +| `key` | The key to get the vector for. ~~Union[int, str]~~ | | **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vectors.\_\_setitem\_\_ {#setitem tag="method"} -Set a vector for the given key. +Set a vector for the given key. Not supported for `floret` mode. > #### Example > @@ -75,7 +98,8 @@ Set a vector for the given key. ## Vectors.\_\_iter\_\_ {#iter tag="method"} -Iterate over the keys in the table. +Iterate over the keys in the table. In `floret` mode, the keys table is not +used. > #### Example > @@ -105,7 +129,8 @@ Return the number of vectors in the table. 
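To illustrate the two modes (the data below is random and purely for demonstration), a default-mode table can be extended after creation, while a floret-mode table must be fully specified up front:

```python
import numpy
from spacy.vectors import Vectors

# Default mode: keys and vectors can be added after creation
vectors = Vectors(shape=(3, 32))
vectors.add("cat", vector=numpy.random.uniform(-1, 1, (32,)).astype("f"))

# Floret mode: vector data and ngram/hash settings are fixed at initialization
data = numpy.random.uniform(-1, 1, (10000, 32)).astype("f")
floret_vectors = Vectors(
    data=data, mode="floret", minn=2, maxn=4, hash_count=2, hash_seed=123
)
```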
## Vectors.\_\_contains\_\_ {#contains tag="method"} -Check whether a key has been mapped to a vector entry in the table. +Check whether a key has been mapped to a vector entry in the table. In `floret` +mode, returns `True` for all keys. > #### Example > @@ -123,11 +148,8 @@ Check whether a key has been mapped to a vector entry in the table. ## Vectors.add {#add tag="method"} Add a key to the table, optionally setting a vector value as well. Keys can be -mapped to an existing vector by setting `row`, or a new vector can be added. -When adding string keys, keep in mind that the `Vectors` class itself has no -[`StringStore`](/api/stringstore), so you have to store the hash-to-string -mapping separately. If you need to manage the strings, you should use the -`Vectors` via the [`Vocab`](/api/vocab) class, e.g. `vocab.vectors`. +mapped to an existing vector by setting `row`, or a new vector can be added. Not +supported for `floret` mode. > #### Example > @@ -152,7 +174,8 @@ Resize the underlying vectors array. If `inplace=True`, the memory is reallocated. This may cause other references to the data to become invalid, so only use `inplace=True` if you're sure that's what you want. If the number of vectors is reduced, keys mapped to rows that have been deleted are removed. -These removed items are returned as a list of `(key, row)` tuples. +These removed items are returned as a list of `(key, row)` tuples. Not supported +for `floret` mode. > #### Example > @@ -168,7 +191,8 @@ These removed items are returned as a list of `(key, row)` tuples. ## Vectors.keys {#keys tag="method"} -A sequence of the keys in the table. +A sequence of the keys in the table. In `floret` mode, the keys table is not +used. > #### Example > @@ -185,7 +209,7 @@ A sequence of the keys in the table. Iterate over vectors that have been assigned to at least one key. Note that some vectors may be unassigned, so the number of vectors returned may be less than -the length of the vectors table. +the length of the vectors table. In `floret` mode, the keys table is not used. > #### Example > @@ -200,7 +224,8 @@ the length of the vectors table. ## Vectors.items {#items tag="method"} -Iterate over `(key, vector)` pairs, in order. +Iterate over `(key, vector)` pairs, in order. In `floret` mode, the keys table +is empty. > #### Example > @@ -215,7 +240,7 @@ Iterate over `(key, vector)` pairs, in order. ## Vectors.find {#find tag="method"} -Look up one or more keys by row, or vice versa. +Look up one or more keys by row, or vice versa. Not supported for `floret` mode. > #### Example > @@ -273,7 +298,8 @@ The vector size, i.e. `rows * dims`. Whether the vectors table is full and has no slots are available for new keys. If a table is full, it can be resized using -[`Vectors.resize`](/api/vectors#resize). +[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always +full and cannot be resized. > #### Example > @@ -291,7 +317,7 @@ If a table is full, it can be resized using Get the number of keys in the table. Note that this is the number of _all_ keys, not just unique vectors. If several keys are mapped to the same vectors, they -will be counted individually. +will be counted individually. In `floret` mode, the keys table is not used. > #### Example > @@ -311,7 +337,8 @@ For each of the given vectors, find the `n` most similar entries to it by cosine. Queries are by vector. Results are returned as a `(keys, best_rows, scores)` tuple. 
If `queries` is large, the calculations are performed in chunks to avoid consuming too much memory. You can set the -`batch_size` to control the size/space trade-off during the calculations. +`batch_size` to control the size/space trade-off during the calculations. Not +supported for `floret` mode. > #### Example > @@ -329,6 +356,21 @@ performed in chunks to avoid consuming too much memory. You can set the | `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ | | **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ | +## Vectors.get_batch {#get_batch tag="method" new="3.2"} + +Get the vectors for the provided keys efficiently as a batch. + +> #### Example +> +> ```python +> words = ["cat", "dog"] +> vectors = nlp.vocab.vectors.get_batch(words) +> ``` + +| Name | Description | +| ------ | --------------------------------------- | +| `keys` | The keys. ~~Iterable[Union[int, str]]~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index d1c9a0a81..3b79c4d0d 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -247,6 +247,10 @@ config can be used to configure the split mode to `A`, `B` or `C`. split_mode = "A" ``` +Extra information, such as reading, inflection form, and the SudachiPy +normalized form, is available in `Token.morph`. For `B` or `C` split modes, +subtokens are stored in `Doc.user_data["sub_tokens"]`. + If you run into errors related to `sudachipy`, which is currently under active diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 6f6cef7c8..e0e787a1d 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -1016,20 +1016,22 @@ commands: [Weights & Biases](https://www.wandb.com/) is a popular platform for experiment tracking. spaCy integrates with it out-of-the-box via the -[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the -`[training.logger]` block of your training [config](/usage/training#config). The -results of each step are then logged in your project, together with the full -**training config**. This means that _every_ hyperparameter, registered function -name and argument will be tracked and you'll be able to see the impact it has on -your results. +[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which +you can add as the `[training.logger]` block of your training +[config](/usage/training#config). The results of each step are then logged in +your project, together with the full **training config**. This means that +_every_ hyperparameter, registered function name and argument will be tracked +and you'll be able to see the impact it has on your results. > #### Example config > > ```ini > [training.logger] -> @loggers = "spacy.WandbLogger.v2" +> @loggers = "spacy.WandbLogger.v3" > project_name = "monitor_spacy_training" > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] +> log_dataset_dir = "corpus" +> model_log_interval = 1000 > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index bd5ea7751..f46f0052b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -942,8 +942,8 @@ During training, the results of each step are passed to a logger function. 
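As a rough sketch of a custom logger function (the registered name below is hypothetical, and only the `step` and `score` keys from the table below are used):

```python
import sys
from typing import IO, Any, Dict, Optional

import spacy
from spacy import Language

@spacy.registry.loggers("my_custom_logger.v1")
def make_custom_logger():
    def setup_logger(nlp: Language, stdout: IO = sys.stdout, stderr: IO = sys.stderr):
        def log_step(info: Optional[Dict[str, Any]]):
            # info is None on steps where nothing is logged
            if info is not None:
                stdout.write(f"Step {info['step']}: score {info['score']}\n")

        def finalize():
            pass

        return log_step, finalize

    return setup_logger
```

Such a function can then be referenced from the `[training.logger]` block of the config.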
By default, these results are written to the console with the [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support for writing the log files to [Weights & Biases](https://www.wandb.com/) with the -[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function -receives a **dictionary** with the following keys: +[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each +step, the logger function receives a **dictionary** with the following keys: | Key | Value | | -------------- | ----------------------------------------------------------------------------------------------------- | diff --git a/website/meta/languages.json b/website/meta/languages.json index 2ba117d53..a7dda6482 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -192,17 +192,10 @@ "models": [ "ja_core_news_sm", "ja_core_news_md", - "ja_core_news_lg" + "ja_core_news_lg", + "ja_core_news_trf" ], "dependencies": [ - { - "name": "Unidic", - "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" - }, - { - "name": "Mecab", - "url": "https://github.com/taku910/mecab" - }, { "name": "SudachiPy", "url": "https://github.com/WorksApplications/SudachiPy" diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 554823ebf..69cec3376 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -31,7 +31,7 @@ const COMPONENT_LINKS = { const MODEL_META = { core: 'Vocabulary, syntax, entities, vectors', - core_sm: 'Vocabulary, syntax, entities', + core_no_vectors: 'Vocabulary, syntax, entities', dep: 'Vocabulary, syntax', ent: 'Named entities', sent: 'Sentence boundaries', @@ -41,14 +41,16 @@ const MODEL_META = { web: 'written text (blogs, news, comments)', news: 'written text (news, media)', wiki: 'Wikipedia', - uas: 'Unlabelled dependencies', - las: 'Labelled dependencies', - dep_uas: 'Unlabelled dependencies', - dep_las: 'Labelled dependencies', + uas: 'Unlabeled dependencies', + las: 'Labeled dependencies', + dep_uas: 'Unlabeled dependencies', + dep_las: 'Labeled dependencies', token_acc: 'Tokenization', tok: 'Tokenization', lemma: 'Lemmatization', morph: 'Morphological analysis', + lemma_acc: 'Lemmatization', + morph_acc: 'Morphological analysis', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)', @@ -115,8 +117,8 @@ function formatVectors(data) { return `${abbrNum(keys)} keys, ${abbrNum(vectors)} unique vectors (${width} dimensions)` } -function formatAccuracy(data) { - const exclude = ['speed'] +function formatAccuracy(data, lang) { + const exclude = (lang !== "ja") ? ['speed'] : ['speed', 'morph_acc'] if (!data) return [] return Object.keys(data) .map(label => { @@ -147,8 +149,7 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - // TODO: remove accuracy fallback - accuracy: formatAccuracy(data.accuracy || data.performance), + accuracy: formatAccuracy(data.performance, data.lang), } } @@ -196,7 +197,7 @@ const Model = ({ const [isError, setIsError] = useState(true) const [meta, setMeta] = useState({}) const { type, genre, size } = getModelComponents(name) - const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type + const display_type = type === 'core' && (size === 'sm' || size === 'trf') ? 
'core_no_vectors' : type const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [ name, compatibility,