From 46250293705b946b762242b0beea38f313412c58 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 19:04:49 +0200 Subject: [PATCH 01/38] Add pin for pyrsistent<0.17.0 (#6116) Add pin for pyrsistent<0.17.0 since pyrsistent>=0.17.1 is only compatible with python3.5+. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b93def651..367eef111 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0 pathlib==1.0.1; python_version < "3.4" tqdm>=4.38.0,<5.0.0 # Optional dependencies +pyrsistent<0.17.0 jsonschema>=2.6.0,<3.1.0 # Development dependencies cython>=0.25 From 9b4979407d989aab01c9734c697ac73004abefe8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 21:52:42 +0200 Subject: [PATCH 02/38] Fix overlapping German noun chunks (#6112) Add a similar fix as in #5470 to prevent the German noun chunks iterator from producing overlapping spans. --- spacy/lang/de/syntax_iterators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 73c1b1a6e..c5513abc0 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -38,9 +38,13 @@ def noun_chunks(doclike): close_app = doc.vocab.strings.add("nk") rbracket = 0 + prev_end = -1 for i, word in enumerate(doclike): if i < rbracket: continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: rbracket = word.i + 1 # try to extend the span to the right @@ -48,6 +52,7 @@ def noun_chunks(doclike): for rdep in doc[word.i].rights: if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app: rbracket = rdep.i + 1 + prev_end = rbracket - 1 yield word.left_edge.i, rbracket, np_label From e4acb286582477caaf5486833781c5802374d171 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 21:53:33 +0200 Subject: [PATCH 03/38] Fix norm in retokenizer split (#6111) Parallel to behavior in merge, reset norm on original token in retokenizer split. --- spacy/tests/doc/test_retokenize_split.py | 19 +++++++++++++++++++ spacy/tokens/_retokenize.pyx | 1 + 2 files changed, 20 insertions(+) diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index d074fddc6..d84c846de 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -198,3 +198,22 @@ def test_doc_retokenizer_realloc(en_vocab): token = doc[0] heads = [(token, 0)] * len(token) retokenizer.split(doc[token.i], list(token.text), heads=heads) + + +def test_doc_retokenizer_split_norm(en_vocab): + """#6060: reset norm in split""" + text = "The quick brownfoxjumpsoverthe lazy dog w/ white spots" + doc = Doc(en_vocab, words=text.split()) + + # Set custom norm on the w/ token. + doc[5].norm_ = "with" + + # Retokenize to split out the words in the token at doc[2]. 
+ token = doc[2] + with doc.retokenize() as retokenizer: + retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)]) + + assert doc[9].text == "w/" + assert doc[9].norm_ == "with" + assert doc[5].text == "over" + assert doc[5].norm_ == "over" diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index abc9b731b..4a030bef6 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -355,6 +355,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): lex = doc.vocab.get(doc.mem, orth) token.lex = lex token.lemma = 0 # reset lemma + token.norm = 0 # reset norm if to_process_tensor: # setting the tensors of the split tokens to array of zeros doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32") From 7489d02deaae09f1d0901122c7c40c71f0e85560 Mon Sep 17 00:00:00 2001 From: Muhammad Fahmi Rasyid Date: Wed, 23 Sep 2020 19:02:26 +0700 Subject: [PATCH 04/38] Update Indonesian Example Phrases (#6124) * create contributor agreement * Update Indonesian example. (see #1107) Update Indonesian examples with more proper phrases. the current phrases contains sensitive and violent words. --- .github/contributors/rasyidf.md | 106 ++++++++++++++++++++++++++++++++ spacy/lang/id/examples.py | 4 +- 2 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/rasyidf.md diff --git a/.github/contributors/rasyidf.md b/.github/contributors/rasyidf.md new file mode 100644 index 000000000..4a70547a3 --- /dev/null +++ b/.github/contributors/rasyidf.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Muhammad Fahmi Rasyid | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-09-23 | +| GitHub username | rasyidf | +| Website (optional) | http://rasyidf.github.io | diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py index 56ac9165e..7b4a4e513 100644 --- a/spacy/lang/id/examples.py +++ b/spacy/lang/id/examples.py @@ -11,8 +11,8 @@ Example sentences to test spaCy and its language models. sentences = [ - "Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali", - "Abu Sayyaf mengeksekusi sandera warga Filipina", + "Indonesia merupakan negara kepulauan yang kaya akan budaya.", + "Berapa banyak warga yang dibutuhkan saat kerja bakti?", "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.", "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.", "Jakarta adalah kota besar yang nyaris tidak pernah tidur." From 27c5795ea5b036fda98292a6486353ba4dc47ed3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 25 Sep 2020 09:23:29 +0200 Subject: [PATCH 05/38] Fix version check in models directory [ci skip] --- website/src/templates/models.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 3c5e9d2a4..a1a6f3b5a 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -69,7 +69,12 @@ function isStableVersion(v) { function getLatestVersion(modelId, compatibility) { for (let [version, models] of Object.entries(compatibility)) { if (isStableVersion(version) && models[modelId]) { - return models[modelId][0] + const modelVersions = models[modelId] + for (let modelVersion of modelVersions) { + if (isStableVersion(modelVersion)) { + return modelVersion + } + } } } } From 4cbb954281ad47148667de130e5c4eb23e579edf Mon Sep 17 00:00:00 2001 From: Elijah Rippeth Date: Wed, 30 Sep 2020 07:26:06 -0400 Subject: [PATCH 06/38] reorder so tagmap is replaced only if a custom file is provided. (#6164) * reorder so tagmap is replaced only if a custom file is provided. 
* Remove unneeded variable initialization Co-authored-by: Adriane Boyd --- spacy/cli/debug_data.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 22540c779..7e6c99c06 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -59,10 +59,6 @@ def debug_data( if not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - # Initialize the model and pipeline pipeline = [p.strip() for p in pipeline.split(",")] if base_model: @@ -70,8 +66,11 @@ def debug_data( else: lang_cls = get_lang_class(lang) nlp = lang_cls() - # Replace tag map with provided mapping - nlp.vocab.morphology.load_tag_map(tag_map) + + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + # Replace tag map with provided mapping + nlp.vocab.morphology.load_tag_map(tag_map) msg.divider("Data format validation") From 3243ddac8f699a69ce2e4e39ae80c62cfd30ad12 Mon Sep 17 00:00:00 2001 From: Yohei Tamura Date: Thu, 1 Oct 2020 21:01:52 +0900 Subject: [PATCH 07/38] Fix/span.sent (#6083) * add fail test * fix test * fix span.sent * Remove incorrect implicit check Co-authored-by: Adriane Boyd --- spacy/tests/doc/test_span.py | 20 +++++++++++++++++--- spacy/tokens/span.pyx | 5 ++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 107078df9..df41aedf5 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -174,19 +174,25 @@ def test_spans_by_character(doc): assert span1.end_char == span2.end_char assert span2.label_ == "GPE" - span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict") + span2 = doc.char_span( + span1.start_char, span1.end_char, label="GPE", alignment_mode="strict" + ) assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char assert span2.label_ == "GPE" # alignment mode "contract" - span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract") + span2 = doc.char_span( + span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract" + ) assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char assert span2.label_ == "GPE" # alignment mode "expand" - span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand") + span2 = doc.char_span( + span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand" + ) assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char assert span2.label_ == "GPE" @@ -318,3 +324,11 @@ def test_span_boundaries(doc): _ = span[-5] with pytest.raises(IndexError): _ = span[5] + + +def test_sent(en_tokenizer): + doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.") + span = doc[1:3] + assert not span.doc.is_sentenced + with pytest.raises(ValueError): + span.sent diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 29b87fa8d..cf0775bae 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -391,8 +391,6 @@ cdef class Span: """RETURNS (Span): The sentence span that the span is a part of.""" if "sent" in self.doc.user_span_hooks: return self.doc.user_span_hooks["sent"](self) - # This should raise if not parsed / no custom sentence boundaries - self.doc.sents # Use `sent_start` token attribute to find sentence boundaries cdef 
int n = 0 if self.doc.is_sentenced: @@ -402,13 +400,14 @@ cdef class Span: start += -1 # Find end of the sentence end = self.end - n = 0 while end < self.doc.length and self.doc.c[end].sent_start != 1: end += 1 n += 1 if n >= self.doc.length: break return self.doc[start:end] + else: + raise ValueError(Errors.E030) @property def ents(self): From fb48de349cd588d601d7c9bdb072f8a51a848694 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 2 Oct 2020 20:31:14 +0200 Subject: [PATCH 08/38] bwd compat for pipe.begin_training --- spacy/errors.py | 4 +++- spacy/language.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index dbb25479d..2c076db52 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,7 +85,9 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master - W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") + W089 = ("The 'begin_training' method has been renamed to 'initialize', " + "for calls to 'nlp' as well as for the individual pipeline " + "components.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") diff --git a/spacy/language.py b/spacy/language.py index 14b9f4eb0..36cd251f3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1207,7 +1207,11 @@ class Language: ) self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) for name, proc in self.pipeline: - if hasattr(proc, "initialize"): + # backwards compatibility for older components + if hasattr(proc, "begin_training"): + warnings.warn(Warnings.W089, DeprecationWarning) + proc.begin_training(get_examples, pipeline=self.pipeline, sgd=self._optimizer) + elif hasattr(proc, "initialize"): p_settings = I["components"].get(name, {}) p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name From 3589a64d44efad29a340b13b505cc47a7fe2c797 Mon Sep 17 00:00:00 2001 From: Stanislav Schmidt Date: Fri, 2 Oct 2020 21:00:11 +0200 Subject: [PATCH 09/38] Change type of texts argument in pipe to iterable (#6186) * Change type of texts argument in pipe to iterable * Add contributor agreement --- .github/contributors/Stannislav.md | 106 +++++++++++++++++++++++++++++ spacy/language.py | 2 +- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/Stannislav.md diff --git a/.github/contributors/Stannislav.md b/.github/contributors/Stannislav.md new file mode 100644 index 000000000..899d6b09b --- /dev/null +++ b/.github/contributors/Stannislav.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. 
+ +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Stanislav Schmidt | +| Company name (if applicable) | Blue Brain Project | +| Title or role (if applicable) | ML Engineer | +| Date | 2020-10-02 | +| GitHub username | Stannislav | +| Website (optional) | | diff --git a/spacy/language.py b/spacy/language.py index e9d195453..ee46da3c1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -751,7 +751,7 @@ class Language(object): ): """Process texts as a stream, and yield `Doc` objects in order. - texts (iterator): A sequence of texts to process. + texts (iterable): A sequence of texts to process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. From db419f6b2f31f603484d8cce2587f5fc2ad31825 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 14:57:46 +0200 Subject: [PATCH 10/38] Improve control of training progress and logging (#6184) * Make logging and progress easier to control * Update docs * Cleanup errors * Fix ConfigValidationError * Pass stdout/stderr, not wasabi.Printer * Fix type * Upd logging example * Fix logger example * Fix type --- spacy/cli/train.py | 20 ++++----- spacy/training/initialize.py | 2 +- spacy/training/loggers.py | 74 ++++++++++++++++++++++++---------- spacy/training/loop.py | 66 +++++++++++++++--------------- website/docs/usage/training.md | 41 +++++++++++-------- 5 files changed, 118 insertions(+), 85 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 57a88159d..0b27f63dc 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -3,6 +3,7 @@ from pathlib import Path from wasabi import msg import typer import logging +import sys from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, setup_gpu @@ -39,7 +40,12 @@ def train_cli( DOCS: https://nightly.spacy.io/api/cli#train """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) - verify_cli_args(config_path, output_path) + # Make sure all files and paths exists if they are needed + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) + if output_path is not None and not output_path.exists(): + output_path.mkdir() + msg.good(f"Created output directory: {output_path}") overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) @@ -50,14 +56,4 @@ def train_cli( nlp = init_nlp(config, use_gpu=use_gpu) msg.good("Initialized pipeline") msg.divider("Training pipeline") - train(nlp, output_path, use_gpu=use_gpu, silent=False) - - -def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None: - # Make sure all files and paths exists if they are needed - if not config_path or not config_path.exists(): - msg.fail("Config file not found", config_path, exits=1) - if output_path is not None: - 
if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") + train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index d64f211c4..7cb1555d7 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -102,7 +102,7 @@ def load_vectors_into_model( "with the packaged vectors. Make sure that the vectors package you're " "loading is compatible with the current version of spaCy." ) - err = ConfigValidationError.from_error(config=None, title=title, desc=desc) + err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc) raise err from None nlp.vocab.vectors = vectors_nlp.vocab.vectors if add_strings: diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 0f054d433..be2da4bd8 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -1,18 +1,24 @@ -from typing import Dict, Any, Tuple, Callable, List +from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO +import wasabi +import tqdm +import sys from ..util import registry from .. import util from ..errors import Errors -from wasabi import msg @registry.loggers("spacy.ConsoleLogger.v1") -def console_logger(): +def console_logger(progress_bar: bool=False): def setup_printer( nlp: "Language", - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + stdout: IO=sys.stdout, + stderr: IO=sys.stderr + ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]: + msg = wasabi.Printer(no_print=True) # we assume here that only components are enabled that should be trained & logged logged_pipes = nlp.pipe_names + eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] score_widths = [max(len(col), 6) for col in score_cols] @@ -22,10 +28,18 @@ def console_logger(): table_header = [col.upper() for col in table_header] table_widths = [3, 6] + loss_widths + score_widths + [6] table_aligns = ["r" for _ in table_widths] - msg.row(table_header, widths=table_widths) - msg.row(["-" * width for width in table_widths]) + stdout.write(msg.row(table_header, widths=table_widths)) + stdout.write(msg.row(["-" * width for width in table_widths])) + progress = None - def log_step(info: Dict[str, Any]): + def log_step(info: Optional[Dict[str, Any]]): + nonlocal progress + + if info is None: + # If we don't have a new checkpoint, just return. 
+ if progress is not None: + progress.update(1) + return try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) @@ -39,24 +53,37 @@ def console_logger(): keys=list(info["losses"].keys()), ) ) from None + scores = [] for col in score_cols: score = info["other_scores"].get(col, 0.0) try: score = float(score) - if col != "speed": - score *= 100 - scores.append("{0:.2f}".format(score)) except TypeError: err = Errors.E916.format(name=col, score_type=type(score)) raise ValueError(err) from None + if col != "speed": + score *= 100 + scores.append("{0:.2f}".format(score)) + data = ( [info["epoch"], info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] ) - msg.row(data, widths=table_widths, aligns=table_aligns) + if progress is not None: + progress.close() + stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns)) + if progress_bar: + # Set disable=None, so that it disables on non-TTY + progress = tqdm.tqdm( + total=eval_frequency, + disable=None, + leave=False, + file=stderr + ) + progress.set_description(f"Epoch {info['epoch']+1}") def finalize(): pass @@ -70,10 +97,12 @@ def console_logger(): def wandb_logger(project_name: str, remove_config_values: List[str] = []): import wandb - console = console_logger() + console = console_logger(progress_bar=False) def setup_logger( nlp: "Language", + stdout: IO=sys.stdout, + stderr: IO=sys.stderr ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: config = nlp.config.interpolate() config_dot = util.dict_to_dot(config) @@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []): del config_dot[field] config = util.dot_to_dict(config_dot) wandb.init(project=project_name, config=config, reinit=True) - console_log_step, console_finalize = console(nlp) + console_log_step, console_finalize = console(nlp, stdout, stderr) - def log_step(info: Dict[str, Any]): + def log_step(info: Optional[Dict[str, Any]]): console_log_step(info) - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) + if info is not None: + score = info["score"] + other_scores = info["other_scores"] + losses = info["losses"] + wandb.log({"score": score}) + if losses: + wandb.log({f"loss_{k}": v for k, v in losses.items()}) + if isinstance(other_scores, dict): + wandb.log(other_scores) def finalize(): console_finalize() diff --git a/spacy/training/loop.py b/spacy/training/loop.py index e20cddd3e..093a9ebb3 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -1,11 +1,11 @@ -from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any +from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO from typing import Optional, TYPE_CHECKING from pathlib import Path from timeit import default_timer as timer from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator import random -import tqdm -from wasabi import Printer +import wasabi +import sys from .example import Example from ..schemas import ConfigSchemaTraining @@ -21,7 +21,8 @@ def train( output_path: Optional[Path] = None, *, use_gpu: int = -1, - silent: bool = False, + stdout: IO=sys.stdout, + stderr: IO=sys.stderr ) -> None: """Train a pipeline. @@ -29,10 +30,15 @@ def train( output_path (Path): Optional output path to save trained model to. use_gpu (int): Whether to train on GPU. 
Make sure to call require_gpu before calling this function. - silent (bool): Whether to pretty-print outputs. + stdout (file): A file-like object to write output messages. To disable + printing, set to io.StringIO. + stderr (file): A second file-like object to write output messages. To disable + printing, set to io.StringIO. + RETURNS (Path / None): The path to the final exported model. """ - msg = Printer(no_print=silent) + # We use no_print here so we can respect the stdout/stderr options. + msg = wasabi.Printer(no_print=True) # Create iterator, which yields out info after each optimization step. config = nlp.config.interpolate() if config["training"]["seed"] is not None: @@ -63,50 +69,44 @@ def train( eval_frequency=T["eval_frequency"], exclude=frozen_components, ) - msg.info(f"Pipeline: {nlp.pipe_names}") + stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}")) if frozen_components: - msg.info(f"Frozen components: {frozen_components}") - msg.info(f"Initial learn rate: {optimizer.learn_rate}") + stdout.write(msg.info(f"Frozen components: {frozen_components}")) + stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}")) with nlp.select_pipes(disable=frozen_components): - print_row, finalize_logger = train_logger(nlp) + log_step, finalize_logger = train_logger(nlp, stdout, stderr) try: - progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) - progress.set_description(f"Epoch 1") for batch, info, is_best_checkpoint in training_step_iterator: - progress.update(1) - if is_best_checkpoint is not None: - progress.close() - print_row(info) - if is_best_checkpoint and output_path is not None: - with nlp.select_pipes(disable=frozen_components): - update_meta(T, nlp, info) - with nlp.use_params(optimizer.averages): - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-best") - progress = tqdm.tqdm(total=T["eval_frequency"], leave=False) - progress.set_description(f"Epoch {info['epoch']}") + log_step(info if is_best_checkpoint else None) + if is_best_checkpoint is not None and output_path is not None: + with nlp.select_pipes(disable=frozen_components): + update_meta(T, nlp, info) + with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) + nlp.to_disk(output_path / "model-best") except Exception as e: - finalize_logger() if output_path is not None: # We don't want to swallow the traceback if we don't have a - # specific error. - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}" + # specific error, but we do want to warn that we're trying + # to do something here. + stdout.write( + msg.warn( + f"Aborting and saving the final best model. 
" + f"Encountered exception: {str(e)}" + ) ) - nlp = before_to_disk(nlp) - nlp.to_disk(output_path / "model-final") raise e finally: finalize_logger() if output_path is not None: - final_model_path = output_path / "model-final" + final_model_path = output_path / "model-last" if optimizer.averages: with nlp.use_params(optimizer.averages): nlp.to_disk(final_model_path) else: nlp.to_disk(final_model_path) - msg.good(f"Saved pipeline to output directory", final_model_path) + # This will only run if we don't hit an error + stdout.write(msg.good("Saved pipeline to output directory", final_model_path)) def train_while_improving( diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 74d2f6de5..fb1efec1b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By default, these results are written to the console with the [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support for writing the log files to [Weights & Biases](https://www.wandb.com/) with the -[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a -**dictionary** with the following keys: +[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function +receives a **dictionary** with the following keys: | Key | Value | | -------------- | ---------------------------------------------------------------------------------------------- | @@ -715,30 +715,37 @@ tabular results to a file: ```python ### functions.py -from typing import Tuple, Callable, Dict, Any +import sys +from typing import IO, Tuple, Callable, Dict, Any import spacy +from spacy import Language from pathlib import Path @spacy.registry.loggers("my_custom_logger.v1") def custom_logger(log_path): - def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]: - with Path(log_path).open("w", encoding="utf8") as file_: - file_.write("step\\t") - file_.write("score\\t") - for pipe in nlp.pipe_names: - file_.write(f"loss_{pipe}\\t") - file_.write("\\n") + def setup_logger( + nlp: Language, + stdout: IO=sys.stdout, + stderr: IO=sys.stderr + ) -> Tuple[Callable, Callable]: + stdout.write(f"Logging to {log_path}\n") + log_file = Path(log_path).open("w", encoding="utf8") + log_file.write("step\\t") + log_file.write("score\\t") + for pipe in nlp.pipe_names: + log_file.write(f"loss_{pipe}\\t") + log_file.write("\\n") - def log_step(info: Dict[str, Any]): - with Path(log_path).open("a") as file_: - file_.write(f"{info['step']}\\t") - file_.write(f"{info['score']}\\t") + def log_step(info: Optional[Dict[str, Any]]): + if info: + log_file.write(f"{info['step']}\\t") + log_file.write(f"{info['score']}\\t") for pipe in nlp.pipe_names: - file_.write(f"{info['losses'][pipe]}\\t") - file_.write("\\n") + log_file.write(f"{info['losses'][pipe]}\\t") + log_file.write("\\n") def finalize(): - pass + log_file.close() return log_step, finalize From 7b127f307e648d4ddbb559efb0bf15c5620a4bcf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 16:06:42 +0200 Subject: [PATCH 11/38] Set version to v3.0.0a30 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index acf386ace..e61e5ab25 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a29" +__version__ = "3.0.0a30" __release__ = True __download_url__ = 
"https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 35d695a031853b1b914ef36bdb84da84f2042ac4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 16:08:24 +0200 Subject: [PATCH 12/38] Update docs --- website/docs/api/dependencyparser.md | 30 +++++++-- website/docs/api/entityrecognizer.md | 30 +++++++-- website/docs/api/morphologizer.md | 30 +++++++-- website/docs/api/tagger.md | 30 +++++++-- website/docs/api/textcategorizer.md | 35 +++++++--- website/docs/usage/training.md | 99 ++++++++++++++++++++++++---- 6 files changed, 209 insertions(+), 45 deletions(-) diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index ea4b779c7..fe8f7d8d5 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -176,12 +176,12 @@ This method was previously called `begin_training`. > path = "corpus/labels/parser.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ | ## DependencyParser.predict {#predict tag="method"} @@ -433,6 +433,24 @@ The labels currently added to the component. | ----------- | ------------------------------------------------------ | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | +## DependencyParser.label_data {#label_data tag="property" new="3"} + +The labels currently added to the component and their internal meta information. 
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`DependencyParser.initialize`](/api/dependencyparser#initialize) to initialize +the model with a pre-defined label set. + +> #### Example +> +> ```python +> labels = parser.label_data +> parser.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~Dict[str, Dict[str, Dict[str, int]]]~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 5fbd0b229..6ac0d163f 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -165,12 +165,12 @@ This method was previously called `begin_training`. > path = "corpus/labels/ner.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Dict[str, Dict[str, int]]]~~ | ## EntityRecognizer.predict {#predict tag="method"} @@ -421,6 +421,24 @@ The labels currently added to the component. | ----------- | ------------------------------------------------------ | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | +## EntityRecognizer.label_data {#label_data tag="property" new="3"} + +The labels currently added to the component and their internal meta information. 
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`EntityRecognizer.initialize`](/api/entityrecognizer#initialize) to initialize +the model with a pre-defined label set. + +> #### Example +> +> ```python +> labels = ner.label_data +> ner.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~Dict[str, Dict[str, Dict[str, int]]]~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 50e2bb33a..d32514fb0 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -147,12 +147,12 @@ config. > path = "corpus/labels/morphologizer.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | ## Morphologizer.predict {#predict tag="method"} @@ -377,6 +377,24 @@ coarse-grained POS as the feature `POS`. | ----------- | ------------------------------------------------------ | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | +## Morphologizer.label_data {#label_data tag="property" new="3"} + +The labels currently added to the component and their internal meta information. 
+This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`Morphologizer.initialize`](/api/morphologizer#initialize) to initialize the +model with a pre-defined label set. + +> #### Example +> +> ```python +> labels = morphologizer.label_data +> morphologizer.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~dict~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index d7c56be67..2123004b6 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -148,12 +148,12 @@ This method was previously called `begin_training`. > path = "corpus/labels/tagger.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[list]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | ## Tagger.predict {#predict tag="method"} @@ -411,6 +411,24 @@ The labels currently added to the component. | ----------- | ------------------------------------------------------ | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | +## Tagger.label_data {#label_data tag="property" new="3"} + +The labels currently added to the component and their internal meta information. +This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`Tagger.initialize`](/api/tagger#initialize) to initialize the model with a +pre-defined label set. 
+ +> #### Example +> +> ```python +> labels = tagger.label_data +> tagger.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index dd8c81040..0901a6fa9 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -29,7 +29,6 @@ architectures and their arguments and hyperparameters. > ```python > from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL > config = { -> "labels": [], > "threshold": 0.5, > "model": DEFAULT_TEXTCAT_MODEL, > } @@ -38,7 +37,6 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ | | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | | `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~ | | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | @@ -61,7 +59,7 @@ architectures and their arguments and hyperparameters. > > # Construction from class > from spacy.pipeline import TextCategorizer -> textcat = TextCategorizer(nlp.vocab, model, labels=[], threshold=0.5, positive_label="POS") +> textcat = TextCategorizer(nlp.vocab, model, threshold=0.5, positive_label="POS") > ``` Create a new pipeline instance. In your application, you would normally use a @@ -74,7 +72,6 @@ shortcut for this and instantiate the component using its string name and | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | _keyword-only_ | | -| `labels` | The labels to use. ~~Iterable[str]~~ | | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | | `positive_label` | The positive label for a binary task with exclusive classes, None otherwise. ~~Optional[str]~~ | @@ -161,12 +158,12 @@ This method was previously called `begin_training`. > path = "corpus/labels/textcat.json > ``` -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. 
~~Optional[Language]~~ | -| `labels` | The label information to add to the component. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[dict]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | ## TextCategorizer.predict {#predict tag="method"} @@ -425,6 +422,24 @@ The labels currently added to the component. | ----------- | ------------------------------------------------------ | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | +## TextCategorizer.label_data {#label_data tag="property" new="3"} + +The labels currently added to the component and their internal meta information. +This is the data generated by [`init labels`](/api/cli#init-labels) and used by +[`TextCategorizer.initialize`](/api/textcategorizer#initialize) to initialize +the model with a pre-defined label set. + +> #### Example +> +> ```python +> labels = textcat.label_data +> textcat.initialize(lambda: [], nlp=nlp, labels=labels) +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------- | +| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 74d2f6de5..6317479bc 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -692,14 +692,14 @@ for writing the log files to [Weights & Biases](https://www.wandb.com/) with the [`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a **dictionary** with the following keys: -| Key | Value | -| -------------- | ---------------------------------------------------------------------------------------------- | -| `epoch` | How many passes over the data have been completed. ~~int~~ | -| `step` | How many steps have been completed. ~~int~~ | -| `score` | The main score from the last evaluation, measured on the dev set. ~~float~~ | -| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ | -| `losses` | The accumulated training losses, keyed by component name. 
~~Dict[str, float]~~ | -| `checkpoints` | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ | +| Key | Value | +| -------------- | ----------------------------------------------------------------------------------------------------- | +| `epoch` | How many passes over the data have been completed. ~~int~~ | +| `step` | How many steps have been completed. ~~int~~ | +| `score` | The main score from the last evaluation, measured on the dev set. ~~float~~ | +| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ | +| `losses` | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~ | +| `checkpoints` | A list of previous results, where each result is a `(score, step)` tuple. ~~List[Tuple[float, int]]~~ | You can easily implement and plug in your own logger that records the training results in a custom way, or sends them to an experiment management tracker of @@ -819,7 +819,84 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: ### Customizing the initialization {#initialization} - +When you start training a new model from scratch, +[`spacy train`](/api/cli#train) will call +[`nlp.initialize`](/api/language#initialize) to initialize the pipeline for +training. This process typically includes the following: + +> #### config.cfg (excerpt) +> +> ```ini +> [initialize] +> vectors = ${paths.vectors} +> init_tok2vec = ${paths.init_tok2vec} +> +> [initialize.components] +> # Settings for components +> ``` + +1. Load in **data resources** defined in the `[initialize]` config, including + **word vectors** and + [pretrained](/usage/embeddings-transformers/#pretraining) **tok2vec + weights**. +2. Call the `initialize` methods of the tokenizer (if implemented, e.g. for + [Chinese](/usage/models#chinese)) and pipeline components with a callback to + access the training data, the current `nlp` object and any **custom + arguments** defined in the `[initialize]` config. +3. In **pipeline components**: if needed, use the data to + [infer missing shapes](/usage/layers-architectures#thinc-shape-inference) and + set up the label scheme if no labels are provided. Components may also load + other data like lookup tables or dictionaries. + +The initialization step allows the config to define **all settings** required +for the pipeline, while keeping a separation between settings and functions that +should only be used **before training** to set up the initial pipeline, and +logic and configuration that needs to be available **at runtime**. Without that +separation, TODO: + +![Illustration of pipeline lifecycle](../images/lifecycle.svg) + +#### Initializing labels {#initialization-labels} + +Built-in pipeline components like the +[`EntityRecognizer`](/api/entityrecognizer) or +[`DependencyParser`](/api/dependencyparser) need to know their available labels +and associated internal meta information to initialize their model weights. +Using the `get_examples` callback provided on initialization, they're able to +**read the labels off the training data** automatically, which is very +convenient – but it can also slow down the training process to compute this +information on every run. + +The [`init labels`](/api/cli#init-labels) command lets you auto-generate JSON +files containing the label data for all supported components. You can then pass +in the labels in the `[initialize]` settings for the respective components to +allow them to initialize faster. 
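For illustration, here is a minimal Python-level sketch of the same idea, assuming a blank pipeline with a `textcat` component and a labels file previously exported by [`init labels`](/api/cli#init-labels) (the path and file contents are assumptions: the file is expected to hold the same data as the component's [`label_data`](/api/textcategorizer#label_data) property). The config-driven equivalent for the `ner` component is shown in the example that follows.

```python
import spacy
import srsly

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# Label data previously exported by "init labels" (hypothetical path)
labels = srsly.read_json("corpus/labels/textcat.json")
# Passing the labels up front means initialization doesn't have to
# extract them from the training data again
textcat.initialize(lambda: [], nlp=nlp, labels=labels)
```
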
+ +> #### config.cfg +> +> ```ini +> [initialize.components.ner] +> +> [initialize.components.ner.labels] +> @readers = "spacy.read_labels.v1" +> path = "corpus/labels/ner.json +> ``` + +```cli +$ python -m spacy init labels config.cfg ./corpus --paths.train ./corpus/train.spacy +``` + +Under the hood, the command delegates to the `label_data` property of the +pipeline components, for instance +[`EntityRecognizer.label_data`](/api/entityrecognizer#label_data). + + + +The JSON format differs for each component and some components need additional +meta information about their labels. The format exported by +[`init labels`](/api/cli#init-labels) matches what the components need, so you +should always let spaCy **auto-generate the labels** for you. + ## Data utilities {#data} @@ -1298,8 +1375,8 @@ of being dropped. > - [`nlp`](/api/language): The `nlp` object with the pipeline components and > their models. -> - [`nlp.initialize`](/api/language#initialize): Start the training and return -> an optimizer to update the component model weights. +> - [`nlp.initialize`](/api/language#initialize): Initialize the pipeline and +> return an optimizer to update the component model weights. > - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds > state between updates. > - [`nlp.update`](/api/language#update): Update component models with examples. From 989a96308f2d8333718279021d8f42d994404e60 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 16:31:58 +0200 Subject: [PATCH 13/38] Tidy up, auto-format, types --- spacy/training/loggers.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index be2da4bd8..e8c948f54 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -1,5 +1,5 @@ from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO -import wasabi +from wasabi import Printer import tqdm import sys @@ -7,15 +7,16 @@ from ..util import registry from .. import util from ..errors import Errors +if TYPE_CHECKING: + from ..language import Language # noqa: F401 + @registry.loggers("spacy.ConsoleLogger.v1") -def console_logger(progress_bar: bool=False): +def console_logger(progress_bar: bool = False): def setup_printer( - nlp: "Language", - stdout: IO=sys.stdout, - stderr: IO=sys.stderr - ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]: - msg = wasabi.Printer(no_print=True) + nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr + ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: + msg = Printer(no_print=True) # we assume here that only components are enabled that should be trained & logged logged_pipes = nlp.pipe_names eval_frequency = nlp.config["training"]["eval_frequency"] @@ -32,14 +33,14 @@ def console_logger(progress_bar: bool=False): stdout.write(msg.row(["-" * width for width in table_widths])) progress = None - def log_step(info: Optional[Dict[str, Any]]): + def log_step(info: Optional[Dict[str, Any]]) -> None: nonlocal progress if info is None: # If we don't have a new checkpoint, just return. 
if progress is not None: progress.update(1) - return + return try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) @@ -78,14 +79,11 @@ def console_logger(progress_bar: bool=False): if progress_bar: # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( - total=eval_frequency, - disable=None, - leave=False, - file=stderr + total=eval_frequency, disable=None, leave=False, file=stderr ) progress.set_description(f"Epoch {info['epoch']+1}") - def finalize(): + def finalize() -> None: pass return log_step, finalize @@ -100,10 +98,8 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []): console = console_logger(progress_bar=False) def setup_logger( - nlp: "Language", - stdout: IO=sys.stdout, - stderr: IO=sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: + nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr + ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: config = nlp.config.interpolate() config_dot = util.dict_to_dot(config) for field in remove_config_values: @@ -124,7 +120,7 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []): if isinstance(other_scores, dict): wandb.log(other_scores) - def finalize(): + def finalize() -> None: console_finalize() wandb.join() From dd542ec6a4d3784f20f44c726893a4a80c67baac Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 17:07:38 +0200 Subject: [PATCH 14/38] Fix label initialization of textcat component (#6190) --- spacy/errors.py | 5 +- spacy/pipeline/senter.pyx | 4 -- spacy/pipeline/textcat.py | 52 +++++-------------- spacy/tests/pipeline/test_textcat.py | 50 +++++++++--------- .../serialize/test_serialize_pipeline.py | 8 +-- spacy/training/initialize.py | 30 ----------- website/docs/api/textcategorizer.md | 40 +++++++------- 7 files changed, 64 insertions(+), 125 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index dbb25479d..119b88369 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -497,8 +497,9 @@ class Errors: E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training " "data that does not appear to be a binary classification problem " "with two labels. Labels found: {labels}") - E920 = ("The textcat's 'positive_label' config setting '{pos_label}' " - "does not match any label in the training data. Labels found: {labels}") + E920 = ("The textcat's 'positive_label' setting '{pos_label}' " + "does not match any label in the training data or provided during " + "initialization. Available labels: {labels}") E921 = ("The method 'set_output' can only be called on components that have " "a Model with a 'resize_output' attribute. Otherwise, the output " "layer can not be dynamically changed.") diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 65c17c771..ec635de5c 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -71,10 +71,6 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) - @property - def label_data(self): - return self.labels - def set_annotations(self, docs, batch_tag_ids): """Modify a batch of documents, using pre-computed scores. 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index a092d960f..989c65b8f 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -56,12 +56,7 @@ subword_features = true @Language.factory( "textcat", assigns=["doc.cats"], - default_config={ - "labels": [], - "threshold": 0.5, - "positive_label": None, - "model": DEFAULT_TEXTCAT_MODEL, - }, + default_config={"threshold": 0.5, "model": DEFAULT_TEXTCAT_MODEL}, default_score_weights={ "cats_score": 1.0, "cats_score_desc": None, @@ -75,12 +70,7 @@ subword_features = true }, ) def make_textcat( - nlp: Language, - name: str, - model: Model[List[Doc], List[Floats2d]], - labels: List[str], - threshold: float, - positive_label: Optional[str], + nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float, ) -> "TextCategorizer": """Create a TextCategorizer compoment. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels can @@ -90,19 +80,9 @@ def make_textcat( model (Model[List[Doc], List[Floats2d]]): A model instance that predicts scores for each category. - labels (list): A list of categories to learn. If empty, the model infers the - categories from the data. threshold (float): Cutoff to consider a prediction "positive". - positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise. """ - return TextCategorizer( - nlp.vocab, - model, - name, - labels=labels, - threshold=threshold, - positive_label=positive_label, - ) + return TextCategorizer(nlp.vocab, model, name, threshold=threshold) class TextCategorizer(Pipe): @@ -112,14 +92,7 @@ class TextCategorizer(Pipe): """ def __init__( - self, - vocab: Vocab, - model: Model, - name: str = "textcat", - *, - labels: List[str], - threshold: float, - positive_label: Optional[str], + self, vocab: Vocab, model: Model, name: str = "textcat", *, threshold: float ) -> None: """Initialize a text categorizer. @@ -127,9 +100,7 @@ class TextCategorizer(Pipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - labels (List[str]): The labels to use. threshold (float): Cutoff to consider a prediction "positive". - positive_label (Optional[str]): The positive label for a binary task with exclusive classes, None otherwise. DOCS: https://nightly.spacy.io/api/textcategorizer#init """ @@ -137,11 +108,7 @@ class TextCategorizer(Pipe): self.model = model self.name = name self._rehearsal_model = None - cfg = { - "labels": labels, - "threshold": threshold, - "positive_label": positive_label, - } + cfg = {"labels": [], "threshold": threshold, "positive_label": None} self.cfg = dict(cfg) @property @@ -348,6 +315,7 @@ class TextCategorizer(Pipe): *, nlp: Optional[Language] = None, labels: Optional[Dict] = None, + positive_label: Optional[str] = None, ): """Initialize the pipe for training, using a representative set of data examples. 
@@ -369,6 +337,14 @@ class TextCategorizer(Pipe): else: for label in labels: self.add_label(label) + if positive_label is not None: + if positive_label not in self.labels: + err = Errors.E920.format(pos_label=positive_label, labels=self.labels) + raise ValueError(err) + if len(self.labels) != 2: + err = Errors.E919.format(pos_label=positive_label, labels=self.labels) + raise ValueError(err) + self.cfg["positive_label"] = positive_label subbatch = list(islice(get_examples(), 10)) doc_sample = [eg.reference for eg in subbatch] label_sample, _ = self._examples_to_truth(subbatch) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index e0a785851..dd0159927 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -10,7 +10,6 @@ from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.training import Example -from spacy.training.initialize import verify_textcat_config from ..util import make_tempdir @@ -21,6 +20,17 @@ TRAIN_DATA = [ ] +def make_get_examples(nlp): + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + def get_examples(): + return train_examples + + return get_examples + + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): nlp = Language() @@ -92,10 +102,7 @@ def test_no_label(): def test_implicit_label(): nlp = Language() nlp.add_pipe("textcat") - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - nlp.initialize(get_examples=lambda: train_examples) + nlp.initialize(get_examples=make_get_examples(nlp)) def test_no_resize(): @@ -113,29 +120,26 @@ def test_no_resize(): def test_initialize_examples(): nlp = Language() textcat = nlp.add_pipe("textcat") - train_examples = [] for text, annotations in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) # you shouldn't really call this more than once, but for testing it should be fine nlp.initialize() - nlp.initialize(get_examples=lambda: train_examples) + get_examples = make_get_examples(nlp) + nlp.initialize(get_examples=get_examples) with pytest.raises(ValueError): nlp.initialize(get_examples=lambda: None) with pytest.raises(ValueError): - nlp.initialize(get_examples=train_examples) + nlp.initialize(get_examples=get_examples()) def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly fix_random_seed(0) nlp = English() + nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} # Set exclusive labels - textcat = nlp.add_pipe( - "textcat", - config={"model": {"exclusive_classes": True}, "positive_label": "POSITIVE"}, - ) + textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}},) train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -203,28 +207,26 @@ def test_textcat_configs(textcat_config): def test_positive_class(): nlp = English() - pipe_config = {"positive_label": "POS", "labels": ["POS", "NEG"]} - textcat = nlp.add_pipe("textcat", config=pipe_config) + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples(nlp) + textcat.initialize(get_examples, labels=["POS", "NEG"], positive_label="POS") assert 
textcat.labels == ("POS", "NEG") - verify_textcat_config(nlp, pipe_config) def test_positive_class_not_present(): nlp = English() - pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING"]} - textcat = nlp.add_pipe("textcat", config=pipe_config) - assert textcat.labels == ("SOME", "THING") + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples(nlp) with pytest.raises(ValueError): - verify_textcat_config(nlp, pipe_config) + textcat.initialize(get_examples, labels=["SOME", "THING"], positive_label="POS") def test_positive_class_not_binary(): nlp = English() - pipe_config = {"positive_label": "POS", "labels": ["SOME", "THING", "POS"]} - textcat = nlp.add_pipe("textcat", config=pipe_config) - assert textcat.labels == ("SOME", "THING", "POS") + textcat = nlp.add_pipe("textcat") + get_examples = make_get_examples(nlp) with pytest.raises(ValueError): - verify_textcat_config(nlp, pipe_config) + textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS") def test_textcat_evaluation(): diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 1c605fea8..f90531dbb 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -136,13 +136,7 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 cfg = {"model": DEFAULT_TEXTCAT_MODEL} model = registry.resolve(cfg, validate=True)["model"] - textcat = TextCategorizer( - en_vocab, - model, - labels=["ENTITY", "ACTION", "MODIFIER"], - threshold=0.5, - positive_label=None, - ) + textcat = TextCategorizer(en_vocab, model, threshold=0.5) textcat.to_bytes(exclude=["vocab"]) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 7cb1555d7..bbdf4f62b 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -50,9 +50,6 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) logger.info("Initialized pipeline components") - # Verify the config after calling 'initialize' to ensure labels - # are properly initialized - verify_config(nlp) return nlp @@ -152,33 +149,6 @@ def init_tok2vec( return False -def verify_config(nlp: "Language") -> None: - """Perform additional checks based on the config, loaded nlp object and training data.""" - # TODO: maybe we should validate based on the actual components, the list - # in config["nlp"]["pipeline"] instead? - for pipe_config in nlp.config["components"].values(): - # We can't assume that the component name == the factory - factory = pipe_config["factory"] - if factory == "textcat": - verify_textcat_config(nlp, pipe_config) - - -def verify_textcat_config(nlp: "Language", pipe_config: Dict[str, Any]) -> None: - # if 'positive_label' is provided: double check whether it's in the data and - # the task is binary - if pipe_config.get("positive_label"): - textcat_labels = nlp.get_pipe("textcat").labels - pos_label = pipe_config.get("positive_label") - if pos_label not in textcat_labels: - raise ValueError( - Errors.E920.format(pos_label=pos_label, labels=textcat_labels) - ) - if len(list(textcat_labels)) != 2: - raise ValueError( - Errors.E919.format(pos_label=pos_label, labels=textcat_labels) - ) - - def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: """RETURNS (List[str]): All sourced components in the original config, e.g. 
{"source": "en_core_web_sm"}. If the config contains a key diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 0901a6fa9..447765e15 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -35,11 +35,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("textcat", config=config) > ``` -| Setting | Description | -| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | -| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~ | -| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | +| Setting | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/textcat.py @@ -59,21 +58,20 @@ architectures and their arguments and hyperparameters. > > # Construction from class > from spacy.pipeline import TextCategorizer -> textcat = TextCategorizer(nlp.vocab, model, threshold=0.5, positive_label="POS") +> textcat = TextCategorizer(nlp.vocab, model, threshold=0.5) > ``` Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| ---------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | -| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise. ~~Optional[str]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} @@ -152,18 +150,20 @@ This method was previously called `begin_training`. 
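As a rough, self-contained sketch (mirroring the updated test suite in this changeset and using toy training data), the new `positive_label` argument can also be passed to `initialize` directly in Python; the config-based equivalent is shown in the example below.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
# Exclusive-classes model config, as used in the tests
textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})

train_data = [("I'm so happy.", {"cats": {"POS": 1.0, "NEG": 0.0}})]
train_examples = [Example.from_dict(nlp.make_doc(t), a) for t, a in train_data]

# Provide the labels and the positive label at initialization time
textcat.initialize(lambda: train_examples, labels=["POS", "NEG"], positive_label="POS")
```
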
> ```ini > ### config.cfg > [initialize.components.textcat] +> positive_label = "POS" > > [initialize.components.textcat.labels] > @readers = "spacy.read_labels.v1" > path = "corpus/labels/textcat.json > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | +| Name | Description | +| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ | +| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. 
~~Optional[str]~~ | ## TextCategorizer.predict {#predict tag="method"} From 7c4ab7e82c5eba0133dee880f5e79d86ec083b13 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 17:16:10 +0200 Subject: [PATCH 15/38] Fix Lemmatizer.get_lookups_config --- spacy/lang/fr/lemmatizer.py | 15 ++++----------- spacy/lang/nl/lemmatizer.py | 14 ++++---------- spacy/lang/pl/lemmatizer.py | 25 ++++++++++--------------- spacy/tests/lang/test_lemmatizers.py | 11 +++++++++-- website/docs/api/lemmatizer.md | 21 ++++----------------- 5 files changed, 31 insertions(+), 55 deletions(-) diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 0dd782cc4..bb5a270ab 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import List, Tuple from ...pipeline import Lemmatizer from ...tokens import Token @@ -15,17 +15,10 @@ class FrenchLemmatizer(Lemmatizer): """ @classmethod - def get_lookups_config(cls, mode: str) -> Dict: + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "rule": - return { - "required_tables": [ - "lemma_lookup", - "lemma_rules", - "lemma_exc", - "lemma_index", - ], - "optional_tables": [], - } + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) else: return super().get_lookups_config(mode) diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 42b97a862..6c025dcf6 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import List, Tuple from ...pipeline import Lemmatizer from ...tokens import Token @@ -6,16 +6,10 @@ from ...tokens import Token class DutchLemmatizer(Lemmatizer): @classmethod - def get_lookups_config(cls, mode: str) -> Dict: + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "rule": - return { - "required_tables": [ - "lemma_lookup", - "lemma_rules", - "lemma_exc", - "lemma_index", - ], - } + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) else: return super().get_lookups_config(mode) diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 406ef9e4a..059d0609a 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import List, Dict, Tuple from ...pipeline import Lemmatizer from ...tokens import Token @@ -11,21 +11,16 @@ class PolishLemmatizer(Lemmatizer): # lemmatization, as well as case-sensitive lemmatization for nouns. 
@classmethod - def get_lookups_config(cls, mode: str) -> Dict: + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: if mode == "pos_lookup": - return { - "required_tables": [ - "lemma_lookup_adj", - "lemma_lookup_adp", - "lemma_lookup_adv", - "lemma_lookup_aux", - "lemma_lookup_noun", - "lemma_lookup_num", - "lemma_lookup_part", - "lemma_lookup_pron", - "lemma_lookup_verb", - ] - } + # fmt: off + required = [ + "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", + "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", + "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb" + ] + # fmt: on + return (required, []) else: return super().get_lookups_config(mode) diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index 5f45664eb..a49d70d6b 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -23,8 +23,9 @@ def test_lemmatizer_initialize(lang, capfd): lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) return lookups + lang_cls = get_lang_class(lang) # Test that languages can be initialized - nlp = get_lang_class(lang)() + nlp = lang_cls() lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) assert not lemmatizer.lookups.tables nlp.config["initialize"]["components"]["lemmatizer"] = { @@ -41,7 +42,13 @@ def test_lemmatizer_initialize(lang, capfd): assert doc[0].lemma_ == "y" # Test initialization by calling .initialize() directly - nlp = get_lang_class(lang)() + nlp = lang_cls() lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) lemmatizer.initialize(lookups=lemmatizer_init_lookups()) assert nlp("x")[0].lemma_ == "y" + + # Test lookups config format + for mode in ("rule", "lookup", "pos_lookup"): + required, optional = lemmatizer.get_lookups_config(mode) + assert isinstance(required, list) + assert isinstance(optional, list) diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 27ea04432..e838c75b2 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -190,23 +190,10 @@ lemmatization entirely. Returns the lookups configuration settings for a given mode for use in [`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups). -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode. ~~str~~ | -| **RETURNS** | The lookups configuration settings for this mode. Includes the keys `"required_tables"` and `"optional_tables"`, mapped to a list of table string names. ~~Dict[str, List[str]]~~ | - -## Lemmatizer.load_lookups {#load_lookups tag="classmethod"} - -Load and validate lookups tables. If the provided lookups is `None`, load the -default lookups tables according to the language and mode settings. Confirm that -all required tables for the language and mode are present. - -| Name | Description | -| ----------- | -------------------------------------------------------------------------------------------------- | -| `lang` | The language. ~~str~~ | -| `mode` | The lemmatizer mode. ~~str~~ | -| `lookups` | The provided lookups, may be `None` if the default lookups should be loaded. ~~Optional[Lookups]~~ | -| **RETURNS** | The lookups. 
~~Lookups~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode. ~~str~~ | +| **RETURNS** | The required table names and the optional table names. ~~Tuple[List[str], List[str]]~~ | ## Lemmatizer.to_disk {#to_disk tag="method"} From 3bc3c05fcc7e4fa40f6a3e43681444b3c36b653e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 17:20:18 +0200 Subject: [PATCH 16/38] Tidy up and auto-format --- spacy/cli/debug_data.py | 2 +- spacy/lang/pl/__init__.py | 1 - spacy/lang/zh/__init__.py | 2 +- spacy/ml/models/tok2vec.py | 8 ++++++-- spacy/pipeline/textcat.py | 2 +- spacy/tests/conftest.py | 3 ++- spacy/tests/doc/test_morphanalysis.py | 14 ++++++++------ spacy/tests/lang/zh/test_serialize.py | 3 ++- spacy/tests/pipeline/test_textcat.py | 7 +++++-- spacy/tests/serialize/test_serialize_doc.py | 8 +++++++- spacy/tests/test_scorer.py | 6 ++---- spacy/training/augment.py | 2 +- spacy/training/loop.py | 6 +++--- spacy/vocab.pyx | 2 +- 14 files changed, 40 insertions(+), 26 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 3dc8d262d..ead759e33 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -171,7 +171,7 @@ def debug_data( n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) msg.warn( "{} words in training data without vectors ({:0.2f}%)".format( - n_missing_vectors, n_missing_vectors / gold_train_data["n_words"], + n_missing_vectors, n_missing_vectors / gold_train_data["n_words"] ), ) msg.text( diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index f7be8a6c2..9e7303e83 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -8,7 +8,6 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...lookups import Lookups from ...language import Language diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 858f41f65..55a77330a 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -47,7 +47,7 @@ class Segmenter(str, Enum): @registry.tokenizers("spacy.zh.ChineseTokenizer") -def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char,): +def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): def chinese_tokenizer_factory(nlp): return ChineseTokenizer(nlp, segmenter=segmenter) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 2870de1b9..f9a906397 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -165,8 +165,12 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed( - width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool, - feature: Union[int, str]="LOWER" + width: int, + rows: int, + nM: int, + nC: int, + also_use_static_vectors: bool, + feature: Union[int, str] = "LOWER", ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedded representation based on character embeddings, using a feed-forward network. 
A fixed number of UTF-8 byte characters are used for diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 989c65b8f..fc60ebf89 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -70,7 +70,7 @@ subword_features = true }, ) def make_textcat( - nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float, + nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float ) -> "TextCategorizer": """Create a TextCategorizer compoment. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels can diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index bcf582388..4a3d126d7 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -294,7 +294,8 @@ def zh_tokenizer_pkuseg(): "segmenter": "pkuseg", } }, - "initialize": {"tokenizer": { + "initialize": { + "tokenizer": { "pkuseg_model": "default", } }, diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index b44b13d4c..918d4acdc 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -5,12 +5,14 @@ import pytest def i_has(en_tokenizer): doc = en_tokenizer("I has") doc[0].set_morph({"PronType": "prs"}) - doc[1].set_morph({ - "VerbForm": "fin", - "Tense": "pres", - "Number": "sing", - "Person": "three", - }) + doc[1].set_morph( + { + "VerbForm": "fin", + "Tense": "pres", + "Number": "sing", + "Person": "three", + } + ) return doc diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py index 58c084ec8..03cdbbe24 100644 --- a/spacy/tests/lang/zh/test_serialize.py +++ b/spacy/tests/lang/zh/test_serialize.py @@ -34,7 +34,8 @@ def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg): "segmenter": "pkuseg", } }, - "initialize": {"tokenizer": { + "initialize": { + "tokenizer": { "pkuseg_model": "medicine", } }, diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index dd0159927..e950c81c6 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -139,7 +139,8 @@ def test_overfitting_IO(): nlp = English() nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} # Set exclusive labels - textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}},) + config = {"model": {"exclusive_classes": True}} + textcat = nlp.add_pipe("textcat", config=config) train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -226,7 +227,9 @@ def test_positive_class_not_binary(): textcat = nlp.add_pipe("textcat") get_examples = make_get_examples(nlp) with pytest.raises(ValueError): - textcat.initialize(get_examples, labels=["SOME", "THING", "POS"], positive_label="POS") + textcat.initialize( + get_examples, labels=["SOME", "THING", "POS"], positive_label="POS" + ) def test_textcat_evaluation(): diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 8b6adb83b..00b9d12d4 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -92,7 +92,13 @@ def test_serialize_doc_bin_unknown_spaces(en_vocab): @pytest.mark.parametrize( - "writer_flag,reader_flag,reader_value", [(True, True, "bar"), (True, False, "bar"), (False, True, "nothing"), (False, False, "nothing")] + 
"writer_flag,reader_flag,reader_value", + [ + (True, True, "bar"), + (True, False, "bar"), + (False, True, "nothing"), + (False, False, "nothing"), + ], ) def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): """Test that custom extensions are correctly serialized in DocBin.""" diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 039f3d4d8..4c1b09849 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -158,7 +158,7 @@ def test_las_per_type(en_vocab): examples = [] for input_, annot in test_las_apple: doc = Doc( - en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"], + en_vocab, words=input_.split(" "), heads=annot["heads"], deps=annot["deps"] ) gold = {"heads": annot["heads"], "deps": annot["deps"]} doc[0].dep_ = "compound" @@ -182,9 +182,7 @@ def test_ner_per_type(en_vocab): examples = [] for input_, annot in test_ner_cardinal: doc = Doc( - en_vocab, - words=input_.split(" "), - ents=["B-CARDINAL", "O", "B-CARDINAL"], + en_vocab, words=input_.split(" "), ents=["B-CARDINAL", "O", "B-CARDINAL"] ) entities = offsets_to_biluo_tags(doc, annot["entities"]) example = Example.from_dict(doc, {"entities": entities}) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 8965c5457..7415ad335 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -30,7 +30,7 @@ class OrthVariants(BaseModel): @registry.augmenters("spacy.orth_variants.v1") def create_orth_variants_augmenter( - level: float, lower: float, orth_variants: OrthVariants, + level: float, lower: float, orth_variants: OrthVariants ) -> Callable[["Language", Example], Iterator[Example]]: """Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 093a9ebb3..fbfc5930f 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -21,8 +21,8 @@ def train( output_path: Optional[Path] = None, *, use_gpu: int = -1, - stdout: IO=sys.stdout, - stderr: IO=sys.stderr + stdout: IO = sys.stdout, + stderr: IO = sys.stderr, ) -> None: """Train a pipeline. @@ -34,7 +34,7 @@ def train( printing, set to io.StringIO. stderr (file): A second file-like object to write output messages. To disable printing, set to io.StringIO. - + RETURNS (Path / None): The path to the final exported model. """ # We use no_print here so we can respect the stdout/stderr options. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ce104d9db..a22f12c65 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -16,7 +16,7 @@ from .errors import Errors from .attrs import intify_attrs, NORM, IS_STOP from .vectors import Vectors from .util import registry -from .lookups import Lookups, load_lookups +from .lookups import Lookups from . 
import util from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang From d6c967401f1a6fb78f34ec70170cecb2e498e3b8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 17:20:47 +0200 Subject: [PATCH 17/38] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index e61e5ab25..ba0ba1f4a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a30" +__version__ = "3.0.0a31" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 989c59918c7c5e1b1c61187b53ec893f7358fcb0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 18:53:39 +0200 Subject: [PATCH 18/38] Update docs [ci skip] --- website/docs/api/cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 32d73d762..e51e698dd 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -232,7 +232,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | -| **CREATES** | The final trained pipeline and the best trained pipeline. | +| **CREATES** | The best trained pipeline and the final checkpoint (if training is terminated). | ## convert {#convert tag="command"} From 80603f0fa57c9735a9b07a9af315d695cb445568 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 18:54:09 +0200 Subject: [PATCH 19/38] Make SentenceRecognizer.label_data return None Overwrite the method from the base class (Tagger) but don't export anything in "init labels" --- spacy/pipeline/senter.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index ec635de5c..231072e9c 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -71,6 +71,10 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) + @property + def label_data(self): + return None + def set_annotations(self, docs, batch_tag_ids): """Modify a batch of documents, using pre-computed scores. 
From c2401fca411559c66fa4172886a24d4d632de162 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 3 Oct 2020 19:12:46 +0200 Subject: [PATCH 20/38] Add tests for Pipe.label_data --- spacy/tests/pipeline/test_pipe_methods.py | 33 ++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index ea09d990c..d6d04f158 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,6 +1,6 @@ import pytest from spacy.language import Language -from spacy.util import SimpleFrozenList +from spacy.util import SimpleFrozenList, get_arg_names @pytest.fixture @@ -346,3 +346,34 @@ def test_pipe_methods_frozen(): nlp.components.sort() with pytest.raises(NotImplementedError): nlp.component_names.clear() + + +@pytest.mark.parametrize( + "pipe", + [ + "tagger", + "parser", + "ner", + "textcat", + pytest.param("morphologizer", marks=pytest.mark.xfail), + ], +) +def test_pipe_label_data_exports_labels(pipe): + nlp = Language() + pipe = nlp.add_pipe(pipe) + # Make sure pipe has pipe labels + assert getattr(pipe, "label_data", None) is not None + # Make sure pipe can be initialized with labels + initialize = getattr(pipe, "initialize", None) + assert initialize is not None + assert "labels" in get_arg_names(initialize) + + +@pytest.mark.parametrize("pipe", ["senter", "entity_linker"]) +def test_pipe_label_data_no_labels(pipe): + nlp = Language() + pipe = nlp.add_pipe(pipe) + assert getattr(pipe, "label_data", None) is None + initialize = getattr(pipe, "initialize", None) + if initialize is not None: + assert "labels" not in get_arg_names(initialize) From 8ea8b7d9406244deec7b357c68d1163268fa613f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 19:13:42 +0200 Subject: [PATCH 21/38] Support loading labels in morphologizer --- spacy/pipeline/morphologizer.pyx | 34 ++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index ab0554692..db6fa0a11 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -134,7 +134,7 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def initialize(self, get_examples, *, nlp=None): + def initialize(self, get_examples, *, nlp=None, labels=None): """Initialize the pipe for training, using a representative set of data examples. 
@@ -145,20 +145,24 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#initialize """ self._ensure_examples(get_examples) - # First, fetch all labels from the data - for example in get_examples(): - for i, token in enumerate(example.reference): - pos = token.pos_ - morph = str(token.morph) - # create and add the combined morph+POS label - morph_dict = Morphology.feats_to_dict(morph) - if pos: - morph_dict[self.POS_FEAT] = pos - norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] - # add label->morph and label->POS mappings - if norm_label not in self.cfg["labels_morph"]: - self.cfg["labels_morph"][norm_label] = morph - self.cfg["labels_pos"][norm_label] = POS_IDS[pos] + if labels is not None: + self.cfg["labels_morph"] = labels["labels_morph"] + self.cfg["labels_pos"] = labels["labels_pos"] + else: + # First, fetch all labels from the data + for example in get_examples(): + for i, token in enumerate(example.reference): + pos = token.pos_ + morph = str(token.morph) + # create and add the combined morph+POS label + morph_dict = Morphology.feats_to_dict(morph) + if pos: + morph_dict[self.POS_FEAT] = pos + norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] + # add label->morph and label->POS mappings + if norm_label not in self.cfg["labels_morph"]: + self.cfg["labels_morph"][norm_label] = morph + self.cfg["labels_pos"][norm_label] = POS_IDS[pos] if len(self.labels) <= 1: raise ValueError(Errors.E143.format(name=self.name)) doc_sample = [] From b305f2ff5a40fed855fee71b6b0cf7dca775ac28 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 19:26:10 +0200 Subject: [PATCH 22/38] Fix loggers --- spacy/training/loggers.py | 8 +++++--- spacy/training/loop.py | 12 +++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index e8c948f54..585764214 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -29,8 +29,8 @@ def console_logger(progress_bar: bool = False): table_header = [col.upper() for col in table_header] table_widths = [3, 6] + loss_widths + score_widths + [6] table_aligns = ["r" for _ in table_widths] - stdout.write(msg.row(table_header, widths=table_widths)) - stdout.write(msg.row(["-" * width for width in table_widths])) + stdout.write(msg.row(table_header, widths=table_widths) + "\n") + stdout.write(msg.row(["-" * width for width in table_widths]) + "\n") progress = None def log_step(info: Optional[Dict[str, Any]]) -> None: @@ -75,7 +75,9 @@ def console_logger(progress_bar: bool = False): ) if progress is not None: progress.close() - stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns)) + stdout.write( + msg.row(data, widths=table_widths, aligns=table_aligns) + "\n" + ) if progress_bar: # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fbfc5930f..2e347829a 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -69,10 +69,10 @@ def train( eval_frequency=T["eval_frequency"], exclude=frozen_components, ) - stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}")) + stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n") if frozen_components: - stdout.write(msg.info(f"Frozen components: {frozen_components}")) - stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}")) + stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") + stdout.write(msg.info(f"Initial learn rate: 
{optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): log_step, finalize_logger = train_logger(nlp, stdout, stderr) try: @@ -93,7 +93,7 @@ def train( msg.warn( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}" - ) + ) + "\n" ) raise e finally: @@ -106,7 +106,9 @@ def train( else: nlp.to_disk(final_model_path) # This will only run if we don't hit an error - stdout.write(msg.good("Saved pipeline to output directory", final_model_path)) + stdout.write( + msg.good("Saved pipeline to output directory", final_model_path) + "\n" + ) def train_while_improving( From 85ede32680686c62bae2522cd5690ca7e826a2a5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 19:26:23 +0200 Subject: [PATCH 23/38] Format --- spacy/training/loggers.py | 4 +--- spacy/training/loop.py | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 585764214..f0ca7064a 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -75,9 +75,7 @@ def console_logger(progress_bar: bool = False): ) if progress is not None: progress.close() - stdout.write( - msg.row(data, widths=table_widths, aligns=table_aligns) + "\n" - ) + stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns) + "\n") if progress_bar: # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 2e347829a..b63adb6c9 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -93,7 +93,8 @@ def train( msg.warn( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}" - ) + "\n" + ) + + "\n" ) raise e finally: From 70b9de8e589776ba90c000addfa24dffe5915b33 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 19:26:52 +0200 Subject: [PATCH 24/38] Set version to v3.0.0a32 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ba0ba1f4a..037ca6bcb 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a31" +__version__ = "3.0.0a32" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 835070cedcc427bd111edf640fd923fa0a93ace8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 19:35:10 +0200 Subject: [PATCH 25/38] Upd test --- spacy/tests/pipeline/test_pipe_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index d6d04f158..0b663fcb8 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -355,7 +355,7 @@ def test_pipe_methods_frozen(): "parser", "ner", "textcat", - pytest.param("morphologizer", marks=pytest.mark.xfail), + "morphologizer" ], ) def test_pipe_label_data_exports_labels(pipe): From 3b2a78720c451773a0dd049a3b7f0c18a8558da4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Oct 2020 19:35:19 +0200 Subject: [PATCH 26/38] Upd morphologizer --- spacy/pipeline/morphologizer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index db6fa0a11..29f0d7fb4 100644 --- a/spacy/pipeline/morphologizer.pyx +++ 
b/spacy/pipeline/morphologizer.pyx @@ -146,8 +146,8 @@ class Morphologizer(Tagger): """ self._ensure_examples(get_examples) if labels is not None: - self.cfg["labels_morph"] = labels["labels_morph"] - self.cfg["labels_pos"] = labels["labels_pos"] + self.cfg["labels_morph"] = labels["morph"] + self.cfg["labels_pos"] = labels["pos"] else: # First, fetch all labels from the data for example in get_examples(): From 3f657ed3a1f7844b3629de018ab3fb6351971590 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 3 Oct 2020 22:34:10 +0200 Subject: [PATCH 27/38] implement warning in __init_subclass__ instead --- spacy/errors.py | 6 +++--- spacy/language.py | 6 +----- spacy/pipeline/pipe.pyx | 10 +++++++++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 2c076db52..791e567eb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,9 +85,9 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master - W089 = ("The 'begin_training' method has been renamed to 'initialize', " - "for calls to 'nlp' as well as for the individual pipeline " - "components.") + W088 = ("This component implements a 'begin_training' method, " + "which should probably be renamed to 'initialize'.") + W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") diff --git a/spacy/language.py b/spacy/language.py index 36cd251f3..14b9f4eb0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1207,11 +1207,7 @@ class Language: ) self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) for name, proc in self.pipeline: - # backwards compatibility for older components - if hasattr(proc, "begin_training"): - warnings.warn(Warnings.W089, DeprecationWarning) - proc.begin_training(get_examples, pipeline=self.pipeline, sgd=self._optimizer) - elif hasattr(proc, "initialize"): + if hasattr(proc, "initialize"): p_settings = I["components"].get(name, {}) p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 5316620e9..a18f04ee3 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True +import warnings from typing import Optional, Tuple import srsly from thinc.api import set_dropout_rate, Model @@ -6,7 +7,7 @@ from thinc.api import set_dropout_rate, Model from ..tokens.doc cimport Doc from ..training import validate_examples -from ..errors import Errors +from ..errors import Errors, Warnings from .. 
import util @@ -33,6 +34,13 @@ cdef class Pipe: self.name = name self.cfg = dict(cfg) + @classmethod + def __init_subclass__(cls, **kwargs): + """Raise a warning if an inheriting class implements 'begin_training' + (from v2) instead of the new 'initialize' method (from v3)""" + if hasattr(cls, "begin_training"): + warnings.warn(Warnings.W088) + @property def labels(self) -> Optional[Tuple[str]]: return [] From 2110e8f86dd47686c25d0d44fff314be0cf60d42 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 10:06:49 +0200 Subject: [PATCH 28/38] Auto-format --- spacy/tests/pipeline/test_pipe_methods.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 0b663fcb8..c0b9762ed 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -349,14 +349,7 @@ def test_pipe_methods_frozen(): @pytest.mark.parametrize( - "pipe", - [ - "tagger", - "parser", - "ner", - "textcat", - "morphologizer" - ], + "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"], ) def test_pipe_label_data_exports_labels(pipe): nlp = Language() From d3b3663942ebe862a83cba4ac5a3e2b0e3a6a2cc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 10:11:27 +0200 Subject: [PATCH 29/38] Adjust error message and add test --- spacy/errors.py | 7 +++++-- spacy/pipeline/pipe.pyx | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 878eed114..5343e7ce8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,8 +85,11 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master - W088 = ("This component implements a 'begin_training' method, " - "which should probably be renamed to 'initialize'.") + W088 = ("The pipeline component {name} implements a 'begin_training' " + "method, which won't be called by spaCy. As of v3.0, 'begin_training' " + "has been renamed to 'initialize' so you likely want to rename the " + "component method. 
See the documentation for details: " + "https://nightly.spacy.io/api/language#initialize") W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index a18f04ee3..41ca23ace 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -39,7 +39,7 @@ cdef class Pipe: """Raise a warning if an inheriting class implements 'begin_training' (from v2) instead of the new 'initialize' method (from v3)""" if hasattr(cls, "begin_training"): - warnings.warn(Warnings.W088) + warnings.warn(Warnings.W088.format(name=cls.__name__)) @property def labels(self) -> Optional[Tuple[str]]: diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index c0b9762ed..e647ba440 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,5 +1,6 @@ import pytest from spacy.language import Language +from spacy.pipeline import Pipe from spacy.util import SimpleFrozenList, get_arg_names @@ -370,3 +371,14 @@ def test_pipe_label_data_no_labels(pipe): initialize = getattr(pipe, "initialize", None) if initialize is not None: assert "labels" not in get_arg_names(initialize) + + +def test_warning_pipe_begin_training(): + with pytest.warns(UserWarning, match="begin_training"): + + class IncompatPipe(Pipe): + def __init__(self): + ... + + def begin_training(*args, **kwargs): + ... From ff914f4e6feec972b5475cc102be97754cd18dd5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 11:10:26 +0200 Subject: [PATCH 30/38] Lazy-load xx --- spacy/training/converters/conll_ner_to_docs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 3b851039c..902db585b 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -2,9 +2,9 @@ from wasabi import Printer from .. import tags_to_entities from ...training import iob_to_biluo -from ...lang.xx import MultiLanguage from ...tokens import Doc, Span from ...util import load_model +from ...util import load_model, get_lang_class def conll_ner_to_docs( @@ -86,7 +86,7 @@ def conll_ner_to_docs( if model: nlp = load_model(model) else: - nlp = MultiLanguage() + nlp = get_lang_class("xx")() output_docs = [] for conll_doc in input_data.strip().split(doc_delimiter): conll_doc = conll_doc.strip() @@ -136,7 +136,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): "Segmenting sentences with sentencizer. 
(Use `-b model` for " "improved parser-based sentence segmentation.)" ) - nlp = MultiLanguage() + nlp = get_lang_class("xx")() sentencizer = nlp.create_pipe("sentencizer") lines = doc.strip().split("\n") words = [line.strip().split()[0] for line in lines] From bcd52e5486b5b2747a39675c45d3bc9846afbe12 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 11:16:31 +0200 Subject: [PATCH 31/38] Tidy up errors and warnings --- spacy/cli/_util.py | 3 +- spacy/errors.py | 323 ++++++++---------- spacy/ml/models/tok2vec.py | 3 +- spacy/ml/staticvectors.py | 13 +- .../pipeline/_parser_internals/arc_eager.pyx | 17 +- spacy/pipeline/_parser_internals/ner.pyx | 10 +- spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/tagger.pyx | 4 +- spacy/scorer.py | 2 +- spacy/tokens/doc.pyx | 4 +- spacy/tokens/span.pyx | 6 +- .../training/converters/conll_ner_to_docs.py | 8 +- spacy/training/converters/iob_to_docs.py | 5 +- spacy/training/pretrain.py | 5 +- 15 files changed, 186 insertions(+), 221 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 69c32bbad..c959c9861 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -322,8 +322,7 @@ def git_checkout( if dest.exists(): msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): - raise IOError("Parent of destination of checkout must exist") - + msg.fail("Parent of destination of checkout must exist", exits=1) if sparse and git_version >= (2, 22): return git_sparse_checkout(repo, subpath, dest, branch) elif sparse: diff --git a/spacy/errors.py b/spacy/errors.py index 5343e7ce8..9145a7b19 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -16,8 +16,6 @@ def add_codes(err_cls): @add_codes class Warnings: - W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing " - "using ftfy.fix_text if necessary.") W005 = ("Doc object not parsed. This means displaCy won't be able to " "generate a dependency visualization for it. Make sure the Doc " "was processed with a model that supports dependency parsing, and " @@ -51,8 +49,6 @@ class Warnings: W017 = ("Alias '{alias}' already exists in the Knowledge Base.") W018 = ("Entity '{entity}' already exists in the Knowledge Base - " "ignoring the duplicate entry.") - W020 = ("Unnamed vectors. This won't allow multiple vectors models to be " - "loaded. (Shape: {shape})") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " "incorrect. Modify PhraseMatcher._terminal_hash to fix.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -65,7 +61,7 @@ class Warnings: "be more efficient to split your training data into multiple " "smaller JSON files instead.") W028 = ("Doc.from_array was called with a vector of type '{type}', " - "but is expecting one of type 'uint64' instead. This may result " + "but is expecting one of type uint64 instead. This may result " "in problems with the vocab further on in the pipeline.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " @@ -79,18 +75,17 @@ class Warnings: "If this is surprising, make sure you have the spacy-lookups-data " "package installed. 
The languages with lexeme normalization tables " "are currently: {langs}") - W034 = ("Please install the package spacy-lookups-data in order to include " - "the default lexeme normalization table for the language '{lang}'.") W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' "attribute or operator.") # TODO: fix numbering after merging develop into master - W088 = ("The pipeline component {name} implements a 'begin_training' " - "method, which won't be called by spaCy. As of v3.0, 'begin_training' " - "has been renamed to 'initialize' so you likely want to rename the " + W088 = ("The pipeline component {name} implements a `begin_training` " + "method, which won't be called by spaCy. As of v3.0, `begin_training` " + "has been renamed to `initialize`, so you likely want to rename the " "component method. See the documentation for details: " "https://nightly.spacy.io/api/language#initialize") - W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.") + W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed " + "to `nlp.initialize`.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") @@ -108,39 +103,33 @@ class Warnings: "download a newer compatible model or retrain your custom model " "with the current spaCy version. For more details and available " "updates, run: python -m spacy validate") - W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' " - "instead.") - W097 = ("No Model config was provided to create the '{name}' component, " - "and no default configuration could be found either.") - W098 = ("No Model config was provided to create the '{name}' component, " - "so a default configuration was used.") - W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " - "but got '{type}' instead, so ignoring it.") + W096 = ("The method `nlp.disable_pipes` is now deprecated - use " + "`nlp.select_pipes` instead.") W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "string \"Field1=Value1,Value2|Field2=Value3\".") - W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.") + W101 = ("Skipping Doc custom extension '{name}' while merging docs.") W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.") W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported " "word segmenters: {supported}. Defaulting to {default}.") W104 = ("Skipping modifications for '{target}' segmenter. The current " "segmenter is '{current}'.") - W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " - "need to match on a stream of documents, you can use nlp.pipe and " + W105 = ("As of spaCy v3.0, the `{matcher}.pipe` method is deprecated. If you " + "need to match on a stream of documents, you can use `nlp.pipe` and " "call the {matcher} on each Doc object.") - W107 = ("The property Doc.{prop} is deprecated. Use " - "Doc.has_annotation(\"{attr}\") instead.") + W107 = ("The property `Doc.{prop}` is deprecated. Use " + "`Doc.has_annotation(\"{attr}\")` instead.") @add_codes class Errors: E001 = ("No component '{name}' found in pipeline. Available names: {opts}") E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). 
" - "This usually happens when spaCy calls nlp.{method} with custom " + "This usually happens when spaCy calls `nlp.{method}` with custom " "component name that's not registered on the current language class. " "If you're using a custom component, make sure you've added the " - "decorator @Language.component (for function components) or " - "@Language.factory (for class components).\n\nAvailable " + "decorator `@Language.component` (for function components) or " + "`@Language.factory` (for class components).\n\nAvailable " "factories: {opts}") E003 = ("Not a valid pipeline component. Expected callable, but " "got {component} (name: '{name}'). If you're using a custom " @@ -158,14 +147,13 @@ class Errors: E008 = ("Can't restore disabled pipeline component '{name}' because it " "doesn't exist in the pipeline anymore. If you want to remove " "components from the pipeline, you should do it before calling " - "`nlp.select_pipes()` or after restoring the disabled components.") + "`nlp.select_pipes` or after restoring the disabled components.") E010 = ("Word vectors set to length 0. This may be because you don't have " "a model installed or loaded, or because your model doesn't " "include word vectors. For more info, see the docs:\n" "https://nightly.spacy.io/usage/models") E011 = ("Unknown operator: '{op}'. Options: {opts}") E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") - E014 = ("Unknown tag ID: {tag}") E016 = ("MultitaskObjective target should be function or one of: dep, " "tag, ent, dep_tag_offset, ent_tag.") E017 = ("Can only add unicode or bytes. Got type: {value_type}") @@ -181,27 +169,24 @@ class Errors: "For example, are all labels added to the model? If you're " "training a named entity recognizer, also make sure that none of " "your annotated entity spans have leading or trailing whitespace " - "or punctuation. " - "You can also use the experimental `debug data` command to " + "or punctuation. You can also use the `debug data` command to " "validate your JSON-formatted training data. For details, run:\n" "python -m spacy debug data --help") E025 = ("String is too long: {length} characters. Max is 2**30.") E026 = ("Error accessing token at position {i}: out of bounds in Doc of " "length {length}.") - E027 = ("Arguments 'words' and 'spaces' should be sequences of the same " - "length, or 'spaces' should be left default at None. spaces " + E027 = ("Arguments `words` and `spaces` should be sequences of the same " + "length, or `spaces` should be left default at None. `spaces` " "should be a sequence of booleans, with True meaning that the " "word owns a ' ' character following it.") - E028 = ("orths_and_spaces expects either a list of unicode string or a " - "list of (unicode, bool) tuples. Got bytes instance: {value}") - E029 = ("noun_chunks requires the dependency parse, which requires a " + E028 = ("`words` expects a list of unicode strings, but got bytes instance: {value}") + E029 = ("`noun_chunks` requires the dependency parse, which requires a " "statistical model to be installed and loaded. For more info, see " "the documentation:\nhttps://nightly.spacy.io/usage/models") E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " - "component to the pipeline with: " - "nlp.add_pipe('sentencizer'). " + "component to the pipeline with: `nlp.add_pipe('sentencizer')`. 
" "Alternatively, add the dependency parser, or set sentence " - "boundaries by setting doc[i].is_sent_start.") + "boundaries by setting `doc[i].is_sent_start`.") E031 = ("Invalid token: empty string ('') at position {i}.") E033 = ("Cannot load into non-empty Doc of length {length}.") E035 = ("Error creating span with start {start} and end {end} for Doc of " @@ -215,7 +200,7 @@ class Errors: "issue here: http://github.com/explosion/spaCy/issues") E040 = ("Attempt to access token at {i}, max length {max_length}.") E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?") - E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.") + E042 = ("Error accessing `doc[{i}].nbor({j})`, for doc of length {length}.") E043 = ("Refusing to write to token.sent_start if its document is parsed, " "because this may cause inconsistent state.") E044 = ("Invalid value for token.sent_start: {value}. Must be one of: " @@ -235,7 +220,7 @@ class Errors: E056 = ("Invalid tokenizer exception: ORTH values combined don't match " "original string.\nKey: {key}\nOrths: {orths}") E057 = ("Stepped slices not supported in Span objects. Try: " - "list(tokens)[start:stop:step] instead.") + "`list(tokens)[start:stop:step]` instead.") E058 = ("Could not retrieve vector for key {key}.") E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}") E060 = ("Cannot add new key to vectors: the table is full. Current shape: " @@ -244,7 +229,7 @@ class Errors: "and 63 are occupied. You can replace one by specifying the " "`flag_id` explicitly, e.g. " "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") - E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 " + E063 = ("Invalid value for `flag_id`: {value}. Flag IDs must be between 1 " "and 63 (inclusive).") E064 = ("Error fetching a Lexeme from the Vocab. When looking up a " "string, the lexeme returned had an orth ID that did not match " @@ -273,7 +258,7 @@ class Errors: E085 = ("Can't create lexeme for string '{string}'.") E087 = ("Unknown displaCy style: {style}.") E088 = ("Text of length {length} exceeds maximum of {max_length}. The " - "v2.x parser and NER models require roughly 1GB of temporary " + "parser and NER models require roughly 1GB of temporary " "memory per 100,000 characters in the input. This means long " "texts may cause memory allocation errors. If you're not using " "the parser or NER, it's probably safe to increase the " @@ -290,8 +275,8 @@ class Errors: E094 = ("Error reading line {line_num} in vectors file {loc}.") E095 = ("Can't write to frozen dictionary. This is likely an internal " "error. Are you writing to a default function argument?") - E096 = ("Invalid object passed to displaCy: Can only visualize Doc or " - "Span objects, or dicts if set to manual=True.") + E096 = ("Invalid object passed to displaCy: Can only visualize `Doc` or " + "Span objects, or dicts if set to `manual=True`.") E097 = ("Invalid pattern: expected token pattern (list of dicts) or " "phrase pattern (string) but got:\n{pattern}") E098 = ("Invalid pattern: expected both RIGHT_ID and RIGHT_ATTRS.") @@ -308,11 +293,11 @@ class Errors: E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. 
A " "token can only be part of one entity, so make sure the entities " "you're setting don't overlap.") - E106 = ("Can't find doc._.{attr} attribute specified in the underscore " + E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore " "settings: {opts}") - E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") + E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}") E109 = ("Component '{name}' could not be run. Did you forget to " - "call initialize()?") + "call `initialize()`?") E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}") E111 = ("Pickling a token is not supported, because tokens are only views " "of the parent Doc and can't exist on their own. A pickled token " @@ -329,8 +314,8 @@ class Errors: E117 = ("The newly split tokens must match the text of the original token. " "New orths: {new}. Old text: {old}.") E118 = ("The custom extension attribute '{attr}' is not registered on the " - "Token object so it can't be set during retokenization. To " - "register an attribute, use the Token.set_extension classmethod.") + "`Token` object so it can't be set during retokenization. To " + "register an attribute, use the `Token.set_extension` classmethod.") E119 = ("Can't set custom extension attribute '{attr}' during " "retokenization because it's not writable. This usually means it " "was registered with a getter function (and no setter) or as a " @@ -354,7 +339,7 @@ class Errors: E130 = ("You are running a narrow unicode build, which is incompatible " "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide " "unicode build instead. You can also rebuild Python and set the " - "--enable-unicode=ucs4 flag.") + "`--enable-unicode=ucs4 flag`.") E131 = ("Cannot write the kb_id of an existing Span object because a Span " "is a read-only view of the underlying Token objects stored in " "the Doc. Instead, create a new Span object and specify the " @@ -367,27 +352,20 @@ class Errors: E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure " - "to provide a valid JSON object as input with either the `text` " - "or `tokens` key. For more info, see the docs:\n" - "https://nightly.spacy.io/api/cli#pretrain-jsonl") - E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input " - "includes either the `text` or `tokens` key. For more info, see " - "the docs:\nhttps://nightly.spacy.io/api/cli#pretrain-jsonl") - E139 = ("Knowledge Base for component '{name}' is empty. Use the methods " - "kb.add_entity and kb.add_alias to add entries.") + E139 = ("Knowledge base for component '{name}' is empty. Use the methods " + "`kb.add_entity` and `kb.add_alias` to add entries.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") E143 = ("Labels for component '{name}' not initialized. This can be fixed " "by calling add_label, or by providing a representative batch of " - "examples to the component's initialize method.") + "examples to the component's `initialize` method.") E145 = ("Error reading `{param}` from input file.") - E146 = ("Could not access `{path}`.") + E146 = ("Could not access {path}.") E147 = ("Unexpected error in the {method} functionality of the " "EntityLinker: {msg}. 
This is likely a bug in spaCy, so feel free " - "to open an issue.") + "to open an issue: https://github.com/explosion/spaCy/issues") E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that " "each entity in `doc.ents` is assigned to a KB identifier.") E149 = ("Error deserializing model. Check that the config used to create " @@ -395,18 +373,18 @@ class Errors: E150 = ("The language of the `nlp` object and the `vocab` should be the " "same, but found '{nlp}' and '{vocab}' respectively.") E152 = ("The attribute {attr} is not supported for token patterns. " - "Please use the option validate=True with Matcher, PhraseMatcher, " + "Please use the option `validate=True` with the Matcher, PhraseMatcher, " "or EntityRuler for more details.") E153 = ("The value type {vtype} is not supported for token patterns. " "Please use the option validate=True with Matcher, PhraseMatcher, " "or EntityRuler for more details.") E154 = ("One of the attributes or values is not supported for token " - "patterns. Please use the option validate=True with Matcher, " + "patterns. Please use the option `validate=True` with the Matcher, " "PhraseMatcher, or EntityRuler for more details.") E155 = ("The pipeline needs to include a {pipe} in order to use " "Matcher or PhraseMatcher with the attribute {attr}. " - "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " - "instead of list(nlp.tokenizer.pipe()).") + "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` " + "instead of `list(nlp.tokenizer.pipe())`.") E157 = ("Can't render negative values for dependency arc start or end. " "Make sure that you're passing in absolute token indices, not " "relative token offsets.\nstart: {start}, end: {end}, label: " @@ -415,13 +393,11 @@ class Errors: E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}") E160 = ("Can't find language data file: {path}") E161 = ("Found an internal inconsistency when predicting entity links. " - "This is likely a bug in spaCy, so feel free to open an issue.") - E162 = ("Cannot evaluate textcat model on data with different labels.\n" - "Labels in model: {model_labels}\nLabels in evaluation " - "data: {eval_labels}") + "This is likely a bug in spaCy, so feel free to open an issue: " + "https://github.com/explosion/spaCy/issues") E163 = ("cumsum was found to be unstable: its last element does not " "correspond to sum") - E164 = ("x is neither increasing nor decreasing: {}.") + E164 = ("x is neither increasing nor decreasing: {x}.") E165 = ("Only one class present in y_true. ROC AUC score is not defined in " "that case.") E166 = ("Can only merge DocBins with the same value for '{param}'.\n" @@ -436,10 +412,10 @@ class Errors: E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you " "accidentally passed a single pattern to Matcher.add instead of a " "list of patterns? If you only want to add one pattern, make sure " - "to wrap it in a list. For example: matcher.add('{key}', [pattern])") + "to wrap it in a list. For example: `matcher.add('{key}', [pattern])`") E179 = ("Invalid pattern. Expected a list of Doc objects but got a single " "Doc. If you only want to add one pattern, make sure to wrap it " - "in a list. For example: matcher.add('{key}', [doc])") + "in a list. For example: `matcher.add('{key}', [doc])`") E180 = ("Span attributes can't be declared as required or assigned by " "components, since spans are only views of the Doc. 
Use Doc and " "Token attributes (or custom extension attributes) only and remove " @@ -447,17 +423,16 @@ class Errors: E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. " "Only Doc and Token attributes are supported.") E182 = ("Received invalid attribute declaration: {attr}\nDid you forget " - "to define the attribute? For example: {attr}.???") + "to define the attribute? For example: `{attr}.???`") E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level " "attributes are supported, for example: {solution}") E184 = ("Only attributes without underscores are supported in component " "attribute declarations (because underscore and non-underscore " "attributes are connected anyways): {attr} -> {solution}") E185 = ("Received invalid attribute in component attribute declaration: " - "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") - E186 = ("'{tok_a}' and '{tok_b}' are different texts.") + "`{obj}.{attr}`\nAttribute '{attr}' does not exist on {obj}.") E187 = ("Only unicode strings are supported as labels.") - E189 = ("Each argument to Doc.__init__ should be of equal length.") + E189 = ("Each argument to `Doc.__init__` should be of equal length.") E190 = ("Token head out of range in `Doc.from_array()` for token index " "'{index}' with value '{value}' (equivalent to relative head " "index: '{rel_head_index}'). The head indices should be relative " @@ -471,17 +446,32 @@ class Errors: "({curr_dim}).") E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.") E195 = ("Matcher can be called on {good} only, got {got}.") - E196 = ("Refusing to write to token.is_sent_end. Sentence boundaries can " - "only be fixed with token.is_sent_start.") + E196 = ("Refusing to write to `token.is_sent_end`. Sentence boundaries can " + "only be fixed with `token.is_sent_start`.") E197 = ("Row out of bounds, unable to add row {row} for key {key}.") E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") - E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") - E200 = ("Specifying a base model with a pretrained component '{component}' " - "can not be combined with adding a pretrained Tok2Vec layer.") - E201 = ("Span index out of range.") + E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.") + E200 = ("Can't yet set {attr} from Span. Vote for this feature on the " + "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " + "Try checking whitespace and delimiters. See " + "https://nightly.spacy.io/api/cli#convert") + E093 = ("The token-per-line NER file is not formatted correctly. Try checking " + "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert") + E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This " + "dimension refers to the output width, after the linear projection " + "has been applied.") + E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This " + "dimension refers to the width of the vectors table.") + E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}") + E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}") + E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.") + E909 = ("Expected {name} in parser internals. 
This is likely a bug in spaCy.") + E910 = ("Encountered NaN value when computing loss for component '{name}'.") + E911 = ("Invalid feature: {feat}. Must be a token attribute.") E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found " "for mode '{mode}'. Required tables: {tables}. Found: {found}.") E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " @@ -494,44 +484,44 @@ class Errors: "final score, set its weight to null in the [training.score_weights] " "section of your training config.") E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})") - E917 = ("Received invalid value {value} for 'state_type' in " + E917 = ("Received invalid value {value} for `state_type` in " "TransitionBasedParser: only 'parser' or 'ner' are valid options.") E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid " - "values are an instance of spacy.vocab.Vocab or True to create one" + "values are an instance of `spacy.vocab.Vocab` or True to create one" " (default).") - E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training " + E919 = ("A textcat `positive_label` '{pos_label}' was provided for training " "data that does not appear to be a binary classification problem " "with two labels. Labels found: {labels}") - E920 = ("The textcat's 'positive_label' setting '{pos_label}' " + E920 = ("The textcat's `positive_label` setting '{pos_label}' " "does not match any label in the training data or provided during " "initialization. Available labels: {labels}") - E921 = ("The method 'set_output' can only be called on components that have " - "a Model with a 'resize_output' attribute. Otherwise, the output " + E921 = ("The method `set_output` can only be called on components that have " + "a Model with a `resize_output` attribute. Otherwise, the output " "layer can not be dynamically changed.") E922 = ("Component '{name}' has been initialized with an output dimension of " "{nO} - cannot add any more labels.") E923 = ("It looks like there is no proper sample data to initialize the " - "Model of component '{name}'. " - "This is likely a bug in spaCy, so feel free to open an issue.") + "Model of component '{name}'. This is likely a bug in spaCy, so " + "feel free to open an issue: https://github.com/explosion/spaCy/issues") E924 = ("The '{name}' component does not seem to be initialized properly. " - "This is likely a bug in spaCy, so feel free to open an issue.") + "This is likely a bug in spaCy, so feel free to open an issue: " + "https://github.com/explosion/spaCy/issues") E925 = ("Invalid color values for displaCy visualizer: expected dictionary " "mapping label names to colors but got: {obj}") - E926 = ("It looks like you're trying to modify nlp.{attr} directly. This " + E926 = ("It looks like you're trying to modify `nlp.{attr}` directly. This " "doesn't work because it's an immutable computed property. 
If you " "need to modify the pipeline, use the built-in methods like " - "nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe " - "instead.") + "`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or " + "`nlp.enable_pipe` instead.") E927 = ("Can't write to frozen list Maybe you're trying to modify a computed " "property or default function argument?") - E928 = ("A 'KnowledgeBase' can only be serialized to/from from a directory, " + E928 = ("A KnowledgeBase can only be serialized to/from from a directory, " "but the provided argument {loc} points to a file.") - E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " - "not seem to exist.") - E930 = ("Received invalid get_examples callback in {name}.initialize. " + E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.") + E930 = ("Received invalid get_examples callback in `{name}.initialize`. " "Expected function that returns an iterable of Example objects but " "got: {obj}") - E931 = ("Encountered Pipe subclass without Pipe.{method} method in component " + E931 = ("Encountered Pipe subclass without `Pipe.{method}` method in component " "'{name}'. If the component is trainable and you want to use this " "method, make sure it's overwritten on the subclass. If your " "component isn't trainable, add a method that does nothing or " @@ -544,21 +534,21 @@ class Errors: "models, see the models directory: https://spacy.io/models. If you " "want to create a blank model, use spacy.blank: " "nlp = spacy.blank(\"{name}\")") - E942 = ("Executing after_{name} callback failed. Expected the function to " + E942 = ("Executing `after_{name}` callback failed. Expected the function to " "return an initialized nlp object but got: {value}. Maybe " "you forgot to return the modified object in your function?") - E943 = ("Executing before_creation callback failed. Expected the function to " + E943 = ("Executing `before_creation` callback failed. Expected the function to " "return an uninitialized Language subclass but got: {value}. Maybe " "you forgot to return the modified object in your function or " "returned the initialized nlp object instead?") - E944 = ("Can't copy pipeline component '{name}' from source model '{model}': " + E944 = ("Can't copy pipeline component '{name}' from source '{model}': " "not found in pipeline. Available components: {opts}") E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded " "nlp object, but got: {source}") - E947 = ("Matcher.add received invalid 'greedy' argument: expected " + E947 = ("`Matcher.add` received invalid `greedy` argument: expected " "a string value from {expected} but got: '{arg}'") - E948 = ("Matcher.add received invalid 'patterns' argument: expected " - "a List, but got: {arg_type}") + E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " + "a list, but got: {arg_type}") E949 = ("Can only create an alignment when the texts are the same.") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") @@ -570,9 +560,9 @@ class Errors: "for your language.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") - E957 = ("Writing directly to Language.factories isn't needed anymore in " - "spaCy v3. 
Instead, you can use the @Language.factory decorator " - "to register your custom component factory or @Language.component " + E957 = ("Writing directly to `Language.factories` isn't needed anymore in " + "spaCy v3. Instead, you can use the `@Language.factory` decorator " + "to register your custom component factory or `@Language.component` " "to register a simple stateless function component that just takes " "a Doc and returns it.") E958 = ("Language code defined in config ({bad_lang_code}) does not match " @@ -590,99 +580,93 @@ class Errors: "component.\n\n{config}") E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, " "got: {cfg_type}.") - E963 = ("Can't read component info from @Language.{decorator} decorator. " + E963 = ("Can't read component info from `@Language.{decorator}` decorator. " "Maybe you forgot to call it? Make sure you're using " - "@Language.{decorator}() instead of @Language.{decorator}.") + "`@Language.{decorator}()` instead of `@Language.{decorator}`.") E964 = ("The pipeline component factory for '{name}' needs to have the " "following named arguments, which are passed in by spaCy:\n- nlp: " "receives the current nlp object and lets you access the vocab\n- " "name: the name of the component instance, can be used to identify " "the component, output losses etc.") - E965 = ("It looks like you're using the @Language.component decorator to " + E965 = ("It looks like you're using the `@Language.component` decorator to " "register '{name}' on a class instead of a function component. If " "you need to register a class or function that *returns* a component " - "function, use the @Language.factory decorator instead.") - E966 = ("nlp.add_pipe now takes the string name of the registered component " + "function, use the `@Language.factory` decorator instead.") + E966 = ("`nlp.add_pipe` now takes the string name of the registered component " "factory, not a callable component. Expected string, but got " "{component} (name: '{name}').\n\n- If you created your component " - "with nlp.create_pipe('name'): remove nlp.create_pipe and call " - "nlp.add_pipe('name') instead.\n\n- If you passed in a component " - "like TextCategorizer(): call nlp.add_pipe with the string name " - "instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom " - "component: Add the decorator @Language.component (for function " - "components) or @Language.factory (for class components / factories) " + "with `nlp.create_pipe('name')`: remove nlp.create_pipe and call " + "`nlp.add_pipe('name')` instead.\n\n- If you passed in a component " + "like `TextCategorizer()`: call `nlp.add_pipe` with the string name " + "instead, e.g. `nlp.add_pipe('textcat')`.\n\n- If you're using a custom " + "component: Add the decorator `@Language.component` (for function " + "components) or `@Language.factory` (for class components / factories) " "to your custom component and assign it a name, e.g. " - "@Language.component('your_name'). You can then run " - "nlp.add_pipe('your_name') to add it to the pipeline.") + "`@Language.component('your_name')`. You can then run " + "`nlp.add_pipe('your_name')` to add it to the pipeline.") E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.") - E968 = ("nlp.replace_pipe now takes the string name of the registered component " + E968 = ("`nlp.replace_pipe` now takes the string name of the registered component " "factory, not a callable component. 
Expected string, but got " "{component}.\n\n- If you created your component with" - "with nlp.create_pipe('name'): remove nlp.create_pipe and call " - "nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a " - "component like TextCategorizer(): call nlp.replace_pipe with the " - "string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n" + "with `nlp.create_pipe('name')`: remove `nlp.create_pipe` and call " + "`nlp.replace_pipe('{name}', 'name')` instead.\n\n- If you passed in a " + "component like `TextCategorizer()`: call `nlp.replace_pipe` with the " + "string name instead, e.g. `nlp.replace_pipe('{name}', 'textcat')`.\n\n" "- If you're using a custom component: Add the decorator " - "@Language.component (for function components) or @Language.factory " + "`@Language.component` (for function components) or `@Language.factory` " "(for class components / factories) to your custom component and " - "assign it a name, e.g. @Language.component('your_name'). You can " - "then run nlp.replace_pipe('{name}', 'your_name').") + "assign it a name, e.g. `@Language.component('your_name')`. You can " + "then run `nlp.replace_pipe('{name}', 'your_name')`.") E969 = ("Expected string values for field '{field}', but received {types} instead. ") E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?") - E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " + E971 = ("Found incompatible lengths in `Doc.from_array`: {array_length} for the " "array and {doc_length} for the Doc itself.") - E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") + E972 = ("`Example.__init__` got None for '{arg}'. Requires Doc.") E973 = ("Unexpected type for NER data") E974 = ("Unknown {obj} attribute: {key}") - E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, " + E976 = ("The method `Example.from_dict` expects a {type} as {n} argument, " "but received None.") E977 = ("Can not compare a MorphAnalysis with a string object. " - "This is likely a bug in spaCy, so feel free to open an issue.") + "This is likely a bug in spaCy, so feel free to open an issue: " + "https://github.com/explosion/spaCy/issues") E978 = ("The {name} method takes a list of Example objects, but got: {types}") - E979 = ("Cannot convert {type} to an Example object.") E980 = ("Each link annotation should refer to a dictionary with at most one " "identifier mapping to 1.0, and all others to 0.0.") - E981 = ("The offsets of the annotations for 'links' could not be aligned " + E981 = ("The offsets of the annotations for `links` could not be aligned " "to token boundaries.") - E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " + E982 = ("The `Token.ent_iob` attribute should be an integer indexing " "into {values}, but found {value}.") E983 = ("Invalid key for '{dict}': {key}. Available keys: " "{keys}") E984 = ("Invalid component config for '{name}': component block needs either " - "a key 'factory' specifying the registered function used to " - "initialize the component, or a key 'source' key specifying a " - "spaCy model to copy the component from. For example, factory = " - "\"ner\" will use the 'ner' factory and all other settings in the " - "block will be passed to it as arguments. 
Alternatively, source = " - "\"en_core_web_sm\" will copy the component from that model.\n\n{config}") - E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}") + "a key `factory` specifying the registered function used to " + "initialize the component, or a key `source` key specifying a " + "spaCy model to copy the component from. For example, `factory = " + "\"ner\"` will use the 'ner' factory and all other settings in the " + "block will be passed to it as arguments. Alternatively, `source = " + "\"en_core_web_sm\"` will copy the component from that model.\n\n{config}") + E985 = ("Can't load model from config file: no [nlp] section found.\n\n{config}") E986 = ("Could not create any training batches: check your input. " - "Are the train and dev paths defined? " - "Is 'discard_oversize' set appropriately? ") - E987 = ("The text of an example training instance is either a Doc or " - "a string, but found {type} instead.") - E988 = ("Could not parse any training examples. Ensure the data is " - "formatted correctly.") - E989 = ("'nlp.update()' was called with two positional arguments. This " + "Are the train and dev paths defined? Is `discard_oversize` set appropriately? ") + E989 = ("`nlp.update()` was called with two positional arguments. This " "may be due to a backwards-incompatible change to the format " "of the training data in spaCy 3.0 onwards. The 'update' " - "function should now be called with a batch of 'Example' " - "objects, instead of (text, annotation) tuples. ") - E991 = ("The function 'select_pipes' should be called with either a " - "'disable' argument to list the names of the pipe components " + "function should now be called with a batch of Example " + "objects, instead of `(text, annotation)` tuples. ") + E991 = ("The function `nlp.select_pipes` should be called with either a " + "`disable` argument to list the names of the pipe components " "that should be disabled, or with an 'enable' argument that " "specifies which pipes should not be disabled.") E992 = ("The function `select_pipes` was called with `enable`={enable} " "and `disable`={disable} but that information is conflicting " "for the `nlp` pipeline with components {names}.") - E993 = ("The config for 'nlp' needs to include a key 'lang' specifying " + E993 = ("The config for the nlp object needs to include a key `lang` specifying " "the code of the language to initialize it with (for example " - "'en' for English) - this can't be 'None'.\n\n{config}") - E996 = ("Could not parse {file}: {msg}") + "'en' for English) - this can't be None.\n\n{config}") E997 = ("Tokenizer special cases are not allowed to modify the text. " "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - E999 = ("Unable to merge the `Doc` objects because they do not all share " + E999 = ("Unable to merge the Doc objects because they do not all share " "the same `Vocab`.") E1000 = ("The Chinese word segmenter is pkuseg but no pkuseg model was " "loaded. Provide the name of a pretrained model or the path to " @@ -694,35 +678,24 @@ class Errors: E1003 = ("Unsupported lemmatizer mode '{mode}'.") E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. " "Required tables: {tables}. Found: {found}. Maybe you forgot to " - "call nlp.initialize() to load in the data?") + "call `nlp.initialize()` to load in the data?") E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for " "'{chunk}'. 
Tokenizer exceptions are only allowed to specify " - "`ORTH` and `NORM`.") - E1006 = ("Unable to initialize {name} model with 0 labels.") + "ORTH and NORM.") E1007 = ("Unsupported DependencyMatcher operator '{op}'.") E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check " "that you are providing a list of patterns as `List[List[dict]]`.") - E1009 = ("String for hash '{val}' not found in StringStore. Set the value " - "through token.morph_ instead or add the string to the " - "StringStore with `nlp.vocab.strings.add(string)`.") E1010 = ("Unable to set entity information for token {i} which is included " "in more than one span in entities, blocked, missing or outside.") - E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " + E1011 = ("Unsupported default '{default}' in `doc.set_ents`. Available " "options: {modes}") E1012 = ("Entity spans and blocked/missing/outside spans should be " - "provided to doc.set_ents as lists of `Span` objects.") + "provided to `doc.set_ents` as lists of Span objects.") E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the " "token itself. To set the morph from this MorphAnalysis, set from " "the string value with: `token.set_morph(str(other_morph))`.") -@add_codes -class TempErrors: - T003 = ("Resizing pretrained Tagger models is not currently supported.") - T007 = ("Can't yet set {attr} from Span. Vote for this feature on the " - "issue tracker: http://github.com/explosion/spaCy/issues") - - # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index f9a906397..1a0979cab 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -6,6 +6,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from ...tokens import Doc from ...util import registry +from ...errors import Errors from ...ml import _character_embed from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor @@ -201,7 +202,7 @@ def CharacterEmbed( """ feature = intify_attr(feature) if feature is None: - raise ValueError("Invalid feature: Must be a token attribute.") + raise ValueError(Errors.E911(feat=feature)) if also_use_static_vectors: model = chain( concatenate( diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 41afdbf80..c77247d33 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -1,11 +1,11 @@ from typing import List, Tuple, Callable, Optional, cast - from thinc.initializers import glorot_uniform_init from thinc.util import partial from thinc.types import Ragged, Floats2d, Floats1d from thinc.api import Model, Ops, registry from ..tokens import Doc +from ..errors import Errors @registry.layers("spacy.StaticVectors.v1") @@ -76,16 +76,9 @@ def init( nO = Y.data.shape[1] if nM is None: - raise ValueError( - "Cannot initialize StaticVectors layer: nM dimension unset. " - "This dimension refers to the width of the vectors table." - ) + raise ValueError(Errors.E905) if nO is None: - raise ValueError( - "Cannot initialize StaticVectors layer: nO dimension unset. " - "This dimension refers to the output width, after the linear " - "projection has been applied." 
- ) + raise ValueError(Errors.E904) model.set_dim("nM", nM) model.set_dim("nO", nO) model.set_param("W", init_W(model.ops, (nO, nM))) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index dafa99bdd..69f015bda 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -9,10 +9,11 @@ from ...strings cimport hash_string from ...structs cimport TokenC from ...tokens.doc cimport Doc, set_children_from_heads from ...training.example cimport Example -from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC +from ...errors import Errors + # Calculate cost as gold/not gold. We don't use scalar value anyway. cdef int BINARY_COSTS = 1 cdef weight_t MIN_SCORE = -90000 @@ -86,7 +87,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, SENT_START_UNKNOWN, 0 ) - + elif is_sent_start is None: gs.state_bits[i] = set_state_flag( gs.state_bits[i], @@ -109,7 +110,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, IS_SENT_START, 0 ) - + for i, (head, label) in enumerate(zip(heads, labels)): if head is not None: gs.heads[i] = head @@ -158,7 +159,7 @@ cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) nogil: ) gs.n_kids_in_stack[i] = 0 gs.n_kids_in_buffer[i] = 0 - + for i in range(stcls.stack_depth()): s_i = stcls.S(i) if not is_head_unknown(gs, s_i): @@ -403,7 +404,7 @@ cdef class RightArc: return 0 sent_start = st._sent[st.B_(0).l_edge].sent_start return sent_start != 1 and st.H(st.S(0)) != st.B(0) - + @staticmethod cdef int transition(StateC* st, attr_t label) nogil: st.add_arc(st.S(0), st.B(0), label) @@ -701,10 +702,10 @@ cdef class ArcEager(TransitionSystem): output[i] = self.c[i].is_valid(st, self.c[i].label) else: output[i] = is_valid[self.c[i].move] - + def get_cost(self, StateClass stcls, gold, int i): if not isinstance(gold, ArcEagerGold): - raise TypeError("Expected ArcEagerGold") + raise TypeError(Errors.E909.format(name="ArcEagerGold")) cdef ArcEagerGold gold_ = gold gold_state = gold_.c n_gold = 0 @@ -717,7 +718,7 @@ cdef class ArcEager(TransitionSystem): cdef int set_costs(self, int* is_valid, weight_t* costs, StateClass stcls, gold) except -1: if not isinstance(gold, ArcEagerGold): - raise TypeError("Expected ArcEagerGold") + raise TypeError(Errors.E909.format(name="ArcEagerGold")) cdef ArcEagerGold gold_ = gold gold_.update(stcls) gold_state = gold_.c diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index 0351bcaf7..4f142caaf 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -1,16 +1,18 @@ -from collections import Counter from libc.stdint cimport int32_t from cymem.cymem cimport Pool +from collections import Counter + from ...typedefs cimport weight_t, attr_t from ...lexeme cimport Lexeme from ...attrs cimport IS_SPACE from ...training.example cimport Example -from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition, do_func_t +from ...errors import Errors + cdef enum: MISSING @@ -248,7 +250,7 @@ cdef class BiluoPushDown(TransitionSystem): def get_cost(self, StateClass stcls, gold, int i): if not isinstance(gold, BiluoGold): - raise TypeError("Expected BiluoGold") + raise TypeError(Errors.E909.format(name="BiluoGold")) cdef BiluoGold gold_ = gold gold_state = gold_.c n_gold = 0 @@ -261,7 +263,7 @@ cdef class 
BiluoPushDown(TransitionSystem): cdef int set_costs(self, int* is_valid, weight_t* costs, StateClass stcls, gold) except -1: if not isinstance(gold, BiluoGold): - raise TypeError("Expected BiluoGold") + raise TypeError(Errors.E909.format(name="BiluoGold")) cdef BiluoGold gold_ = gold gold_.update(stcls) gold_state = gold_.c diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 29f0d7fb4..82f3bf37d 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -238,7 +238,7 @@ class Morphologizer(Tagger): truths.append(eg_truths) d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): - raise ValueError("nan value when computing loss") + raise ValueError(Errors.E910.format(name=self.name)) return float(loss), d_scores def score(self, examples, **kwargs): diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 231072e9c..0bfef7c7b 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -125,7 +125,7 @@ class SentenceRecognizer(Tagger): truths.append(eg_truth) d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): - raise ValueError("nan value when computing loss") + raise ValueError(Errors.E910.format(name=self.name)) return float(loss), d_scores def initialize(self, get_examples, *, nlp=None): diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 37ad42b88..6cb582b36 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -15,7 +15,7 @@ from .pipe import Pipe, deserialize_config from ..language import Language from ..attrs import POS, ID from ..parts_of_speech import X -from ..errors import Errors, TempErrors, Warnings +from ..errors import Errors, Warnings from ..scorer import Scorer from ..training import validate_examples from .. 
import util @@ -258,7 +258,7 @@ class Tagger(Pipe): truths = [eg.get_aligned("TAG", as_string=True) for eg in examples] d_scores, loss = loss_func(scores, truths) if self.model.ops.xp.isnan(loss): - raise ValueError("nan value when computing loss") + raise ValueError(Errors.E910.format(name=self.name)) return float(loss), d_scores def initialize(self, get_examples, *, nlp=None, labels=None): diff --git a/spacy/scorer.py b/spacy/scorer.py index db32dabae..d1065f3a9 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -905,7 +905,7 @@ def _auc(x, y): if np.all(dx <= 0): direction = -1 else: - raise ValueError(Errors.E164.format(x)) + raise ValueError(Errors.E164.format(x=x)) area = direction * np.trapz(y, x) if isinstance(area, np.memmap): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 9dfa6e714..3404274ce 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -245,7 +245,7 @@ cdef class Doc: self.noun_chunks_iterator = self.vocab.get_noun_chunks cdef bint has_space if words is None and spaces is not None: - raise ValueError("words must be set if spaces is set") + raise ValueError(Errors.E908) elif spaces is None and words is not None: self.has_unknown_spaces = True else: @@ -309,7 +309,7 @@ cdef class Doc: else: if len(ent) < 3 or ent[1] != "-": raise ValueError(Errors.E177.format(tag=ent)) - ent_iob, ent_type = ent.split("-", 1) + ent_iob, ent_type = ent.split("-", 1) if ent_iob not in iob_strings: raise ValueError(Errors.E177.format(tag=ent)) ent_iob = iob_strings.index(ent_iob) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 781474d3a..6a14e2849 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -17,7 +17,7 @@ from ..lexeme cimport Lexeme from ..symbols cimport dep from ..util import normalize_slice -from ..errors import Errors, TempErrors, Warnings +from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args @@ -652,7 +652,7 @@ cdef class Span: return self.root.ent_id def __set__(self, hash_t key): - raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) + raise NotImplementedError(Errors.E200.format(attr="ent_id")) property ent_id_: """RETURNS (str): The (string) entity ID.""" @@ -660,7 +660,7 @@ cdef class Span: return self.root.ent_id_ def __set__(self, hash_t key): - raise NotImplementedError(TempErrors.T007.format(attr="ent_id_")) + raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property def orth_(self): diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 902db585b..28f0f87c3 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -3,7 +3,7 @@ from wasabi import Printer from .. import tags_to_entities from ...training import iob_to_biluo from ...tokens import Doc, Span -from ...util import load_model +from ...errors import Errors from ...util import load_model, get_lang_class @@ -103,11 +103,7 @@ def conll_ner_to_docs( lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: - raise ValueError( - "The token-per-line NER file is not formatted correctly. " - "Try checking whitespace and delimiters. 
See " - "https://nightly.spacy.io/api/cli#convert" - ) + raise ValueError(Errors.E093) length = len(cols[0]) words.extend(cols[0]) sent_starts.extend([True] + [False] * (length - 1)) diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py index bfd981649..73ad8953d 100644 --- a/spacy/training/converters/iob_to_docs.py +++ b/spacy/training/converters/iob_to_docs.py @@ -4,6 +4,7 @@ from .conll_ner_to_docs import n_sents_info from ...vocab import Vocab from ...training import iob_to_biluo, tags_to_entities from ...tokens import Doc, Span +from ...errors import Errors from ...util import minibatch @@ -45,9 +46,7 @@ def read_iob(raw_sents, vocab, n_sents): sent_words, sent_iob = zip(*sent_tokens) sent_tags = ["-"] * len(sent_words) else: - raise ValueError( - "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert" - ) + raise ValueError(Errors.E092) words.extend(sent_words) tags.extend(sent_tags) iob.extend(sent_iob) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 4f05c6344..b91fb07a8 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -16,6 +16,7 @@ from ..attrs import ID from ..ml.models.multi_task import build_cloze_multi_task_model from ..ml.models.multi_task import build_cloze_characters_multi_task_model from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain +from ..errors import Errors from ..util import registry, load_model_from_config, dot_to_object @@ -151,9 +152,9 @@ def create_objective(config: Config): distance = L2Distance(normalize=True, ignore_zeros=True) return partial(get_vectors_loss, distance=distance) else: - raise ValueError("Unexpected loss type", config["loss"]) + raise ValueError(Errors.E906.format(loss_type=config["loss"])) else: - raise ValueError("Unexpected objective_type", objective_type) + raise ValueError(Errors.E907.format(objective_type=objective_type)) def get_vectors_loss(ops, docs, prediction, distance): From 96b636c2d3f8e8f62bf53e0c5c30147c48bca537 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 4 Oct 2020 13:08:21 +0200 Subject: [PATCH 32/38] Update attribute ruler --- spacy/pipeline/attributeruler.py | 38 ++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index f314953e9..b4580ff7c 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -18,15 +18,16 @@ from .. import util MatcherPatternType = List[Dict[Union[int, str], Any]] AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] +TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] +MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] @Language.factory( - "attribute_ruler", default_config={"pattern_dicts": None, "validate": False} + "attribute_ruler", default_config={"validate": False} ) def make_attribute_ruler( nlp: Language, name: str, - pattern_dicts: Optional[Iterable[AttributeRulerPatternType]], validate: bool, ): return AttributeRuler( @@ -49,14 +50,14 @@ class AttributeRuler(Pipe): pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = None, validate: bool = False, ) -> None: - """Initialize the AttributeRuler. + """Create the AttributeRuler. 
After creation, you can add patterns + with the `.initialize()` or `.add_patterns()` methods, or load patterns + with `.from_bytes()` or `.from_disk()`. Loading patterns will remove + any patterns you've added previously. vocab (Vocab): The vocab. name (str): The pipe name. Defaults to "attribute_ruler". - pattern_dicts (Iterable[Dict]): A list of pattern dicts with the keys as - the arguments to AttributeRuler.add (`patterns`/`attrs`/`index`) to add - as patterns. - + RETURNS (AttributeRuler): The AttributeRuler component. DOCS: https://nightly.spacy.io/api/attributeruler#init @@ -68,8 +69,27 @@ class AttributeRuler(Pipe): self._attrs_unnormed = [] # store for reference self.indices = [] - if pattern_dicts: - self.add_patterns(pattern_dicts) + def initialize( + self, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, + nlp: Optional[Language] = None, + patterns: Optional[Iterable[AttributeRulerPatternType]] = None, + tag_map: Optional[TagMapType]=None, + morph_rules: Optional[MorphRulesType]=None + ): + """Initialize the attribute ruler by adding zero or more patterns. + + Rules can be specified as a sequence of dicts using the `patterns` + keyword argument. You can also provide rules using the "tag map" or + "morph rules" formats supported by spaCy prior to v3. + """ + if patterns: + self.add_patterns(patterns) + if tag_map: + self.load_from_tag_map(tag_map) + if morph_rules: + self.load_from_morph_rules(morph_rules) def __call__(self, doc: Doc) -> Doc: """Apply the AttributeRuler to a Doc and set all attribute exceptions. From 11347f34da5182d35559eae644231a432fb4d9c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 13:54:05 +0200 Subject: [PATCH 33/38] Tidy up, tests and docs --- spacy/pipeline/attributeruler.py | 57 ++++---- spacy/tests/pipeline/test_attributeruler.py | 105 ++++++++------ website/docs/api/attributeruler.md | 145 +++++++++++--------- website/docs/usage/linguistic-features.md | 12 +- website/docs/usage/v3.md | 26 +++- 5 files changed, 193 insertions(+), 152 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index b4580ff7c..9e6174d07 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -1,10 +1,11 @@ +from typing import List, Dict, Union, Iterable, Any, Optional, Callable, Iterator +from typing import Tuple import srsly -from typing import List, Dict, Union, Iterable, Any, Optional from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..training import validate_examples +from ..training import validate_examples, Example from ..language import Language from ..matcher import Matcher from ..scorer import Scorer @@ -22,17 +23,9 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] -@Language.factory( - "attribute_ruler", default_config={"validate": False} -) -def make_attribute_ruler( - nlp: Language, - name: str, - validate: bool, -): - return AttributeRuler( - nlp.vocab, name, pattern_dicts=pattern_dicts, validate=validate - ) +@Language.factory("attribute_ruler", default_config={"validate": False}) +def make_attribute_ruler(nlp: Language, name: str, validate: bool): + return AttributeRuler(nlp.vocab, name, validate=validate) class AttributeRuler(Pipe): @@ -43,12 +36,7 @@ class AttributeRuler(Pipe): """ def __init__( - self, - vocab: Vocab, - name: str = "attribute_ruler", - *, - pattern_dicts: Optional[Iterable[AttributeRulerPatternType]] = 
None, - validate: bool = False, + self, vocab: Vocab, name: str = "attribute_ruler", *, validate: bool = False ) -> None: """Create the AttributeRuler. After creation, you can add patterns with the `.initialize()` or `.add_patterns()` methods, or load patterns @@ -57,7 +45,7 @@ class AttributeRuler(Pipe): vocab (Vocab): The vocab. name (str): The pipe name. Defaults to "attribute_ruler". - + RETURNS (AttributeRuler): The AttributeRuler component. DOCS: https://nightly.spacy.io/api/attributeruler#init @@ -71,15 +59,15 @@ class AttributeRuler(Pipe): def initialize( self, - get_examples: Optional[Callable[[], Iterable[Example]]] = None, + get_examples: Optional[Callable[[], Iterable[Example]]], *, nlp: Optional[Language] = None, patterns: Optional[Iterable[AttributeRulerPatternType]] = None, - tag_map: Optional[TagMapType]=None, - morph_rules: Optional[MorphRulesType]=None + tag_map: Optional[TagMapType] = None, + morph_rules: Optional[MorphRulesType] = None, ): """Initialize the attribute ruler by adding zero or more patterns. - + Rules can be specified as a sequence of dicts using the `patterns` keyword argument. You can also provide rules using the "tag map" or "morph rules" formats supported by spaCy prior to v3. @@ -126,7 +114,7 @@ class AttributeRuler(Pipe): set_token_attrs(span[index], attrs) return doc - def pipe(self, stream, *, batch_size=128): + def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are applied to the Doc. @@ -210,16 +198,16 @@ class AttributeRuler(Pipe): self.attrs.append(attrs) self.indices.append(index) - def add_patterns(self, pattern_dicts: Iterable[AttributeRulerPatternType]) -> None: + def add_patterns(self, patterns: Iterable[AttributeRulerPatternType]) -> None: """Add patterns from a list of pattern dicts with the keys as the arguments to AttributeRuler.add. - pattern_dicts (Iterable[dict]): A list of pattern dicts with the keys + patterns (Iterable[dict]): A list of pattern dicts with the keys as the arguments to AttributeRuler.add (patterns/attrs/index) to add as patterns. DOCS: https://nightly.spacy.io/api/attributeruler#add_patterns """ - for p in pattern_dicts: + for p in patterns: self.add(**p) @property @@ -234,7 +222,7 @@ class AttributeRuler(Pipe): all_patterns.append(p) return all_patterns - def score(self, examples, **kwargs): + def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Score a batch of examples. examples (Iterable[Example]): The examples to score. @@ -275,7 +263,7 @@ class AttributeRuler(Pipe): def from_bytes( self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList() - ): + ) -> "AttributeRuler": """Load the AttributeRuler from a bytestring. bytes_data (bytes): The data to load. @@ -293,7 +281,6 @@ class AttributeRuler(Pipe): "patterns": load_patterns, } util.from_bytes(bytes_data, deserialize, exclude) - return self def to_disk( @@ -303,6 +290,7 @@ class AttributeRuler(Pipe): path (Union[Path, str]): A path to a directory. exclude (Iterable[str]): String names of serialization fields to exclude. + DOCS: https://nightly.spacy.io/api/attributeruler#to_disk """ serialize = { @@ -313,11 +301,13 @@ class AttributeRuler(Pipe): def from_disk( self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList() - ) -> None: + ) -> "AttributeRuler": """Load the AttributeRuler from disk. path (Union[Path, str]): A path to a directory. 
exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (AttributeRuler): The loaded object. + DOCS: https://nightly.spacy.io/api/attributeruler#from_disk """ @@ -329,11 +319,10 @@ class AttributeRuler(Pipe): "patterns": load_patterns, } util.from_disk(path, deserialize, exclude) - return self -def _split_morph_attrs(attrs): +def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]: """Split entries from a tag map or morph rules dict into to two dicts, one with the token-level features (POS, LEMMA) and one with the remaining features, which are presumed to be individual MORPH features.""" diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 5773127af..c967bcdcd 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -63,6 +63,39 @@ def morph_rules(): return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}} +def check_tag_map(ruler): + doc = Doc( + ruler.vocab, + words=["This", "is", "a", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = ruler(doc) + for i in range(len(doc)): + if i == 4: + assert doc[i].pos_ == "PUNCT" + assert str(doc[i].morph) == "PunctType=peri" + else: + assert doc[i].pos_ == "" + assert str(doc[i].morph) == "" + + +def check_morph_rules(ruler): + doc = Doc( + ruler.vocab, + words=["This", "is", "the", "test", "."], + tags=["DT", "VBZ", "DT", "NN", "."], + ) + doc = ruler(doc) + for i in range(len(doc)): + if i != 2: + assert doc[i].pos_ == "" + assert str(doc[i].morph) == "" + else: + assert doc[2].pos_ == "DET" + assert doc[2].lemma_ == "a" + assert str(doc[2].morph) == "Case=Nom" + + def test_attributeruler_init(nlp, pattern_dicts): a = nlp.add_pipe("attribute_ruler") for p in pattern_dicts: @@ -78,7 +111,8 @@ def test_attributeruler_init(nlp, pattern_dicts): def test_attributeruler_init_patterns(nlp, pattern_dicts): # initialize with patterns - nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], patterns=pattern_dicts) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert str(doc[2].morph) == "Case=Nom|Number=Plur" @@ -88,10 +122,11 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") # initialize with patterns from asset - nlp.add_pipe( - "attribute_ruler", - config={"pattern_dicts": {"@misc": "attribute_ruler_patterns"}}, - ) + nlp.config["initialize"]["components"]["attribute_ruler"] = { + "patterns": {"@misc": "attribute_ruler_patterns"} + } + nlp.add_pipe("attribute_ruler") + nlp.initialize() doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert str(doc[2].morph) == "Case=Nom|Number=Plur" @@ -103,18 +138,15 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): def test_attributeruler_score(nlp, pattern_dicts): # initialize with patterns - nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], patterns=pattern_dicts) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert str(doc[3].morph) == "Case=Nom|Number=Sing" - - dev_examples = [ - Example.from_dict( - nlp.make_doc("This is a test."), {"lemmas": ["this", "is", "a", "cat", "."]} - ) - ] + doc = nlp.make_doc("This is a test.") + dev_examples = [Example.from_dict(doc, 
{"lemmas": ["this", "is", "a", "cat", "."]})] scores = nlp.evaluate(dev_examples) # "cat" is the only correct lemma assert scores["lemma_acc"] == pytest.approx(0.2) @@ -139,40 +171,27 @@ def test_attributeruler_rule_order(nlp): def test_attributeruler_tag_map(nlp, tag_map): - a = AttributeRuler(nlp.vocab) - a.load_from_tag_map(tag_map) - doc = Doc( - nlp.vocab, - words=["This", "is", "a", "test", "."], - tags=["DT", "VBZ", "DT", "NN", "."], - ) - doc = a(doc) - for i in range(len(doc)): - if i == 4: - assert doc[i].pos_ == "PUNCT" - assert str(doc[i].morph) == "PunctType=peri" - else: - assert doc[i].pos_ == "" - assert str(doc[i].morph) == "" + ruler = AttributeRuler(nlp.vocab) + ruler.load_from_tag_map(tag_map) + check_tag_map(ruler) + + +def test_attributeruler_tag_map_initialize(nlp, tag_map): + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], tag_map=tag_map) + check_tag_map(ruler) def test_attributeruler_morph_rules(nlp, morph_rules): - a = AttributeRuler(nlp.vocab) - a.load_from_morph_rules(morph_rules) - doc = Doc( - nlp.vocab, - words=["This", "is", "the", "test", "."], - tags=["DT", "VBZ", "DT", "NN", "."], - ) - doc = a(doc) - for i in range(len(doc)): - if i != 2: - assert doc[i].pos_ == "" - assert str(doc[i].morph) == "" - else: - assert doc[2].pos_ == "DET" - assert doc[2].lemma_ == "a" - assert str(doc[2].morph) == "Case=Nom" + ruler = AttributeRuler(nlp.vocab) + ruler.load_from_morph_rules(morph_rules) + check_morph_rules(ruler) + + +def test_attributeruler_morph_rules_initialize(nlp, morph_rules): + ruler = nlp.add_pipe("attribute_ruler") + ruler.initialize(lambda: [], morph_rules=morph_rules) + check_morph_rules(ruler) def test_attributeruler_indices(nlp): diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index 60fda6bda..b89759080 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -4,6 +4,7 @@ tag: class source: spacy/pipeline/attributeruler.py new: 3 teaser: 'Pipeline component for rule-based token attribute assignment' +api_base_class: /api/pipe api_string_name: attribute_ruler api_trainable: false --- @@ -25,17 +26,13 @@ how the component should be configured. You can override its settings via the > #### Example > > ```python -> config = { -> "pattern_dicts": None, -> "validate": True, -> } +> config = {"validate": True} > nlp.add_pipe("attribute_ruler", config=config) > ``` -| Setting | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | -| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | +| Setting | Description | +| ---------- | --------------------------------------------------------------------------------------------- | +| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | ```python %%GITHUB_SPACY/spacy/pipeline/attributeruler.py @@ -43,36 +40,26 @@ how the component should be configured. 
You can override its settings via the ## AttributeRuler.\_\_init\_\_ {#init tag="method"} -Initialize the attribute ruler. If pattern dicts are supplied here, they need to -be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"` -keys, e.g.: - -```python -pattern_dicts = [ - {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}}, - {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}}, -] -``` +Initialize the attribute ruler. > #### Example > > ```python > # Construction via add_pipe -> attribute_ruler = nlp.add_pipe("attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | -| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | -| _keyword-only_ | | -| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | -| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | +| _keyword-only_ | | +| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} -Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched -by the provided patterns. +Apply the attribute ruler to a `Doc`, setting token attributes for tokens +matched by the provided patterns. | Name | Description | | ----------- | -------------------------------- | @@ -90,10 +77,10 @@ may be negative to index from the end of the span. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") > patterns = [[{"TAG": "VB"}]] > attrs = {"POS": "VERB"} -> attribute_ruler.add(patterns=patterns, attrs=attrs) +> ruler.add(patterns=patterns, attrs=attrs) > ``` | Name | Description | @@ -107,11 +94,10 @@ may be negative to index from the end of the span. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> pattern_dicts = [ +> ruler = nlp.add_pipe("attribute_ruler") +> patterns = [ > { -> "patterns": [[{"TAG": "VB"}]], -> "attrs": {"POS": "VERB"} +> "patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"} > }, > { > "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]], @@ -119,15 +105,16 @@ may be negative to index from the end of the span. > "index": -1 > }, > ] -> attribute_ruler.add_patterns(pattern_dicts) +> ruler.add_patterns(patterns) > ``` -Add patterns from a list of pattern dicts with the keys as the arguments to +Add patterns from a list of pattern dicts. Each pattern dict can specify the +keys `"patterns"`, `"attrs"` and `"index"`, which match the arguments of [`AttributeRuler.add`](/api/attributeruler#add). 
-| Name | Description | -| --------------- | -------------------------------------------------------------------------- | -| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ | +| Name | Description | +| ---------- | -------------------------------------------------------------------------- | +| `patterns` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ | ## AttributeRuler.patterns {#patterns tag="property"} @@ -139,20 +126,39 @@ Get all patterns that have been added to the attribute ruler in the | ----------- | -------------------------------------------------------------------------------------------- | | **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ | -## AttributeRuler.score {#score tag="method" new="3"} +## AttributeRuler.initialize {#initialize tag="method"} -Score a batch of examples. +Initialize the component with data. Typically called before training to load in +rules from a file. This method is typically called by +[`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. > #### Example > > ```python -> scores = attribute_ruler.score(examples) +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.initialize(lambda: [], nlp=nlp, patterns=patterns) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.attribute_ruler] +> +> [initialize.components.attribute_ruler.patterns] +> @readers = "srsly.read_json.v1" +> path = "corpus/attribute_ruler_patterns.json > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects (the training data). Not used by this component. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `patterns` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | +| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. Defaults to `None`. ~~Optional[Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | +| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. Defaults to `None`. 
~~Optional[Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]]~~ | ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"} @@ -170,6 +176,21 @@ Load attribute ruler patterns from morph rules. | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | +## AttributeRuler.score {#score tag="method" new="3"} + +Score a batch of examples. + +> #### Example +> +> ```python +> scores = ruler.score(examples) +> ``` + +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | + ## AttributeRuler.to_disk {#to_disk tag="method"} Serialize the pipe to disk. @@ -177,8 +198,8 @@ Serialize the pipe to disk. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler.to_disk("/path/to/attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.to_disk("/path/to/attribute_ruler") > ``` | Name | Description | @@ -194,8 +215,8 @@ Load the pipe from disk. Modifies the object in place and returns it. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler.from_disk("/path/to/attribute_ruler") +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.from_disk("/path/to/attribute_ruler") > ``` | Name | Description | @@ -210,8 +231,8 @@ Load the pipe from disk. Modifies the object in place and returns it. > #### Example > > ```python -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler_bytes = attribute_ruler.to_bytes() +> ruler = nlp.add_pipe("attribute_ruler") +> ruler = ruler.to_bytes() > ``` Serialize the pipe to a bytestring. @@ -229,9 +250,9 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > #### Example > > ```python -> attribute_ruler_bytes = attribute_ruler.to_bytes() -> attribute_ruler = nlp.add_pipe("attribute_ruler") -> attribute_ruler.from_bytes(attribute_ruler_bytes) +> ruler_bytes = ruler.to_bytes() +> ruler = nlp.add_pipe("attribute_ruler") +> ruler.from_bytes(ruler_bytes) > ``` | Name | Description | @@ -250,12 +271,12 @@ serialization by passing in the string names via the `exclude` argument. > #### Example > > ```python -> data = attribute_ruler.to_disk("/path", exclude=["vocab"]) +> data = ruler.to_disk("/path", exclude=["vocab"]) > ``` -| Name | Description | -| ---------- | -------------------------------------------------------------- | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | -| `attrs` | The attributes to set. You usually don't want to exclude this. | -| `indices` | The token indices. You usually don't want to exclude this. 
| +| Name | Description | +| ---------- | --------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. | +| `attrs` | The attributes to set. You usually don't want to exclude this. | +| `indices` | The token indices. You usually don't want to exclude this. | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 7b9aaa0b9..1964bac18 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1801,17 +1801,7 @@ print(doc2[5].tag_, doc2[5].pos_) # WP PRON -For easy migration from from spaCy v2 to v3, the -[`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** -in the v2 format with the methods -[`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and -[`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules). - -```diff -nlp = spacy.blank("en") -+ ruler = nlp.add_pipe("attribute_ruler") -+ ruler.load_from_tag_map(YOUR_TAG_MAP) -``` +The [`AttributeRuler`](/api/attributeruler) can import a **tag map and morph rules** in the v2.x format via its built-in methods or when the component is initialized before training. See the [migration guide](/usage/v3#migrating-training-mappings-exceptions) for details. diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 4ce57af01..a10fc6321 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -804,8 +804,30 @@ nlp = spacy.blank("en") Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy v3.0 now manages mappings and exceptions with a separate and more flexible pipeline component, the [`AttributeRuler`](/api/attributeruler). See the -[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. The -`AttributeRuler` provides two handy helper methods +[usage guide](/usage/linguistic-features#mappings-exceptions) for examples. If +you have tag maps and morph rules in the v2.x format, you can load them into the +attribute ruler before training using the `[initialize]` block of your config. + +> #### What does the initialization do? +> +> The `[initialize]` block is used when +> [`nlp.initialize`](/api/language#initialize) is called (usually right before +> training). It lets you define data resources for initializing the pipeline in +> your `config.cfg`. After training, the rules are saved to disk with the +> exported pipeline, so your runtime model doesn't depend on local data. For +> details see the [config lifecycle](/usage/training/#config-lifecycle) and +> [initialization](/usage/training/#initialization) docs. 
+ +```ini +### config.cfg (excerpt) +[initialize.components.attribute_ruler] + +[initialize.components.attribute_ruler.tag_map] +@readers = "srsly.read_json.v1" +path = "./corpus/tag_map.json" +``` + +The `AttributeRuler` also provides two handy helper methods [`load_from_tag_map`](/api/attributeruler#load_from_tag_map) and [`load_from_morph_rules`](/api/attributeruler#load_from_morph_rules) that let you load in your existing tag map or morph rules: From 9b3a9343615bf98e01abaa0a7db0fe563458fdf5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 14:14:55 +0200 Subject: [PATCH 34/38] Update docs [ci skip] --- website/docs/usage/training.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 8516b444c..1981f03b7 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -8,6 +8,7 @@ menu: - ['Config System', 'config'] - ['Custom Training', 'config-custom'] - ['Custom Functions', 'custom-functions'] + - ['Initialization', 'initialization'] - ['Data Utilities', 'data'] - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] @@ -824,12 +825,15 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]: return create_model(output_width) ``` -### Customizing the initialization {#initialization} +## Customizing the initialization {#initialization} When you start training a new model from scratch, [`spacy train`](/api/cli#train) will call -[`nlp.initialize`](/api/language#initialize) to initialize the pipeline for -training. This process typically includes the following: +[`nlp.initialize`](/api/language#initialize) to initialize the pipeline and load +the required data. All settings for this are defined in the +[`[initialize]`](/api/data-formats#config-initialize) block of the config, so +you can keep track of how the initial `nlp` object was created. The +initialization process typically includes the following: > #### config.cfg (excerpt) > @@ -859,10 +863,22 @@ The initialization step allows the config to define **all settings** required for the pipeline, while keeping a separation between settings and functions that should only be used **before training** to set up the initial pipeline, and logic and configuration that needs to be available **at runtime**. Without that -separation, TODO: +separation, it would be very difficult to use the came, reproducible config file +because the component settings required for training (load data from an external +file) wouldn't match the component settings required at runtime (load what's +included with the saved `nlp` object and don't depend on external file). ![Illustration of pipeline lifecycle](../images/lifecycle.svg) + + +For details and examples of how pipeline components can **save and load data +assets** like model weights or lookup tables, and how the component +initialization is implemented under the hood, see the usage guide on +[serializing and initializing component data](/usage/processing-pipelines#component-data-initialization). 
+ + + #### Initializing labels {#initialization-labels} Built-in pipeline components like the From 84ae197dd6d229b1ef34a205d1103f87623c0db6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 4 Oct 2020 14:16:53 +0200 Subject: [PATCH 35/38] Fix logger --- spacy/training/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/loop.py b/spacy/training/loop.py index b63adb6c9..0d4414964 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -77,7 +77,7 @@ def train( log_step, finalize_logger = train_logger(nlp, stdout, stderr) try: for batch, info, is_best_checkpoint in training_step_iterator: - log_step(info if is_best_checkpoint else None) + log_step(info if is_best_checkpoint is not None else None) if is_best_checkpoint is not None and output_path is not None: with nlp.select_pipes(disable=frozen_components): update_meta(T, nlp, info) From 8f018e47f84264ca852c67578af1ab95cbd74be3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 14:43:45 +0200 Subject: [PATCH 36/38] Adjust [initialize.components] on Language.remove_pipe and Language.rename_pipe --- spacy/language.py | 7 +++++++ spacy/tests/pipeline/test_pipe_methods.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index d76741da3..9fdde03d5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -896,6 +896,10 @@ class Language: self._components[i] = (new_name, self._components[i][1]) self._pipe_meta[new_name] = self._pipe_meta.pop(old_name) self._pipe_configs[new_name] = self._pipe_configs.pop(old_name) + # Make sure [initialize] config is adjusted + if old_name in self._config["initialize"]["components"]: + init_cfg = self._config["initialize"]["components"].pop(old_name) + self._config["initialize"]["components"][new_name] = init_cfg def remove_pipe(self, name: str) -> Tuple[str, Callable[[Doc], Doc]]: """Remove a component from the pipeline. @@ -912,6 +916,9 @@ class Language: # because factory may be used for something else self._pipe_meta.pop(name) self._pipe_configs.pop(name) + # Make sure name is removed from the [initialize] config + if name in self._config["initialize"]["components"]: + self._config["initialize"]["components"].pop(name) # Make sure the name is also removed from the set of disabled components if name in self.disabled: self._disabled.remove(name) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index e647ba440..a4297a1d1 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -382,3 +382,25 @@ def test_warning_pipe_begin_training(): def begin_training(*args, **kwargs): ... 
+ + +def test_pipe_methods_initialize(): + """Test that the [initialize] config reflects the components correctly.""" + nlp = Language() + nlp.add_pipe("tagger") + assert "tagger" not in nlp.config["initialize"]["components"] + nlp.config["initialize"]["components"]["tagger"] = {"labels": ["hello"]} + assert nlp.config["initialize"]["components"]["tagger"] == {"labels": ["hello"]} + nlp.remove_pipe("tagger") + assert "tagger" not in nlp.config["initialize"]["components"] + nlp.add_pipe("tagger") + assert "tagger" not in nlp.config["initialize"]["components"] + nlp.config["initialize"]["components"]["tagger"] = {"labels": ["hello"]} + nlp.rename_pipe("tagger", "my_tagger") + assert "tagger" not in nlp.config["initialize"]["components"] + assert nlp.config["initialize"]["components"]["my_tagger"] == {"labels": ["hello"]} + nlp.config["initialize"]["components"]["test"] = {"foo": "bar"} + nlp.add_pipe("ner", name="test") + assert "test" in nlp.config["initialize"]["components"] + nlp.remove_pipe("test") + assert "test" not in nlp.config["initialize"]["components"] From d38dc466c5d17cc66f6be4edc028e13e41788b6c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 15:26:01 +0200 Subject: [PATCH 37/38] Adjust error [ci skip] --- spacy/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 9145a7b19..20edf45b5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -185,8 +185,8 @@ class Errors: "the documentation:\nhttps://nightly.spacy.io/usage/models") E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " "component to the pipeline with: `nlp.add_pipe('sentencizer')`. " - "Alternatively, add the dependency parser, or set sentence " - "boundaries by setting `doc[i].is_sent_start`.") + "Alternatively, add the dependency parser or sentence recognizer, " + "or set sentence boundaries by setting `doc[i].is_sent_start`.") E031 = ("Invalid token: empty string ('') at position {i}.") E033 = ("Cannot load into non-empty Doc of length {length}.") E035 = ("Error creating span with start {start} and end {end} for Doc of " From 3c36a57e84a4792af59cab5ea5b76c2301c303a4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 4 Oct 2020 17:46:29 +0200 Subject: [PATCH 38/38] Update data augmenters (#6196) * Draft lower-case augmenter * Make warning a debug log * Update lowercase augmenter, docs and tests Co-authored-by: Matthew Honnibal --- spacy/tests/training/test_augmenters.py | 100 ++++++++++++++++++++++++ spacy/tests/training/test_training.py | 60 +------------- spacy/training/augment.py | 31 ++++++++ spacy/training/example.pyx | 3 +- website/docs/api/top-level.md | 32 ++++++-- 5 files changed, 161 insertions(+), 65 deletions(-) create mode 100644 spacy/tests/training/test_augmenters.py diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py new file mode 100644 index 000000000..0bd4d5ef2 --- /dev/null +++ b/spacy/tests/training/test_augmenters.py @@ -0,0 +1,100 @@ +import pytest +from spacy.training import Corpus +from spacy.training.augment import create_orth_variants_augmenter +from spacy.training.augment import create_lower_casing_augmenter +from spacy.lang.en import English +from spacy.tokens import DocBin, Doc +from contextlib import contextmanager +import random + +from ..util import make_tempdir + + +@contextmanager +def make_docbin(docs, name="roundtrip.spacy"): + with make_tempdir() as tmpdir: + output_file = tmpdir / name + DocBin(docs=docs).to_disk(output_file) + yield 
output_file + + +@pytest.fixture +def nlp(): + return English() + + +@pytest.fixture +def doc(nlp): + # fmt: off + words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."] + tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] + pos = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"] + ents = ["B-PERSON", "I-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"] + cats = {"TRAVEL": 1.0, "BAKING": 0.0} + # fmt: on + doc = Doc(nlp.vocab, words=words, tags=tags, pos=pos, ents=ents) + doc.cats = cats + return doc + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_make_orth_variants(nlp, doc): + single = [ + {"tags": ["NFP"], "variants": ["…", "..."]}, + {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, + ] + augmenter = create_orth_variants_augmenter( + level=0.2, lower=0.5, orth_variants={"single": single} + ) + with make_docbin([doc]) as output_file: + reader = Corpus(output_file, augmenter=augmenter) + # Due to randomness, only test that it works without errors for now + list(reader(nlp)) + + +def test_lowercase_augmenter(nlp, doc): + augmenter = create_lower_casing_augmenter(level=1.0) + with make_docbin([doc]) as output_file: + reader = Corpus(output_file, augmenter=augmenter) + corpus = list(reader(nlp)) + eg = corpus[0] + assert eg.reference.text == doc.text.lower() + assert eg.predicted.text == doc.text.lower() + ents = [(e.start, e.end, e.label) for e in doc.ents] + assert [(e.start, e.end, e.label) for e in eg.reference.ents] == ents + for ref_ent, orig_ent in zip(eg.reference.ents, doc.ents): + assert ref_ent.text == orig_ent.text.lower() + assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc] + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_custom_data_augmentation(nlp, doc): + def create_spongebob_augmenter(randomize: bool = False): + def augment(nlp, example): + text = example.text + if randomize: + ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text] + else: + ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)] + example_dict = example.to_dict() + doc = nlp.make_doc("".join(ch)) + example_dict["token_annotation"]["ORTH"] = [t.text for t in doc] + yield example + yield example.from_dict(doc, example_dict) + + return augment + + with make_docbin([doc]) as output_file: + reader = Corpus(output_file, augmenter=create_spongebob_augmenter()) + corpus = list(reader(nlp)) + orig_text = "Sarah 's sister flew to Silicon Valley via London . " + augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . 
" + assert corpus[0].text == orig_text + assert corpus[0].reference.text == orig_text + assert corpus[0].predicted.text == orig_text + assert corpus[1].text == augmented + assert corpus[1].reference.text == augmented + assert corpus[1].predicted.text == augmented + ents = [(e.start, e.end, e.label) for e in doc.ents] + assert [(e.start, e.end, e.label) for e in corpus[0].reference.ents] == ents + assert [(e.start, e.end, e.label) for e in corpus[1].reference.ents] == ents diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 7d41c8908..07e1aef01 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,23 +1,20 @@ import numpy from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment from spacy.training import biluo_tags_to_spans, iob_to_biluo -from spacy.training import Corpus, docs_to_json -from spacy.training.example import Example +from spacy.training import Corpus, docs_to_json, Example from spacy.training.converters import json_to_docs -from spacy.training.augment import create_orth_variants_augmenter from spacy.lang.en import English from spacy.tokens import Doc, DocBin from spacy.util import get_words_and_spaces, minibatch from thinc.api import compounding import pytest import srsly -import random from ..util import make_tempdir @pytest.fixture -def doc(en_vocab): +def doc(): nlp = English() # make sure we get a new vocab every time # fmt: off words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."] @@ -495,59 +492,6 @@ def test_roundtrip_docs_to_docbin(doc): assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_make_orth_variants(doc): - nlp = English() - orth_variants = { - "single": [ - {"tags": ["NFP"], "variants": ["…", "..."]}, - {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]}, - ] - } - augmenter = create_orth_variants_augmenter( - level=0.2, lower=0.5, orth_variants=orth_variants - ) - with make_tempdir() as tmpdir: - output_file = tmpdir / "roundtrip.spacy" - DocBin(docs=[doc]).to_disk(output_file) - # due to randomness, test only that this runs with no errors for now - reader = Corpus(output_file, augmenter=augmenter) - list(reader(nlp)) - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_custom_data_augmentation(doc): - def create_spongebob_augmenter(randomize: bool = False): - def augment(nlp, example): - text = example.text - if randomize: - ch = [c.lower() if random.random() < 0.5 else c.upper() for c in text] - else: - ch = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)] - example_dict = example.to_dict() - doc = nlp.make_doc("".join(ch)) - example_dict["token_annotation"]["ORTH"] = [t.text for t in doc] - yield example - yield example.from_dict(doc, example_dict) - - return augment - - nlp = English() - with make_tempdir() as tmpdir: - output_file = tmpdir / "roundtrip.spacy" - DocBin(docs=[doc]).to_disk(output_file) - reader = Corpus(output_file, augmenter=create_spongebob_augmenter()) - corpus = list(reader(nlp)) - orig_text = "Sarah 's sister flew to Silicon Valley via London . " - augmented = "SaRaH 's sIsTeR FlEw tO SiLiCoN VaLlEy vIa lOnDoN . 
" - assert corpus[0].text == orig_text - assert corpus[0].reference.text == orig_text - assert corpus[0].predicted.text == orig_text - assert corpus[1].text == augmented - assert corpus[1].reference.text == augmented - assert corpus[1].predicted.text == augmented - - @pytest.mark.skip("Outdated") @pytest.mark.parametrize( "tokens_a,tokens_b,expected", diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 7415ad335..e6d10a195 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -34,16 +34,47 @@ def create_orth_variants_augmenter( ) -> Callable[["Language", Example], Iterator[Example]]: """Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. + + level (float): The percentage of texts that will be augmented. + lower (float): The percentage of texts that will be lowercased. + orth_variants (Dict[str, dict]): A dictionary containing the single and + paired orth variants. Typically loaded from a JSON file. + RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. """ return partial( orth_variants_augmenter, orth_variants=orth_variants, level=level, lower=lower ) +@registry.augmenters("spacy.lower_case.v1") +def create_lower_casing_augmenter( + level: float, +) -> Callable[["Language", Example], Iterator[Example]]: + """Create a data augmentation callback that converts documents to lowercase. + The callback can be added to a corpus or other data iterator during training. + + level (float): The percentage of texts that will be augmented. + RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter. + """ + return partial(lower_casing_augmenter, level=level) + + def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]: yield example +def lower_casing_augmenter( + nlp: "Language", example: Example, *, level: float, +) -> Iterator[Example]: + if random.random() >= level: + yield example + else: + example_dict = example.to_dict() + doc = nlp.make_doc(example.text.lower()) + example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in doc] + yield example.from_dict(doc, example_dict) + + def orth_variants_augmenter( nlp: "Language", example: Example, diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index f6225135c..1f3a36b33 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -12,6 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_tags_to_spans from ..errors import Errors, Warnings from ..pipeline._parser_internals import nonproj +from ..util import logger cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot): @@ -390,7 +391,7 @@ def _fix_legacy_dict_data(example_dict): if "HEAD" in token_dict and "SENT_START" in token_dict: # If heads are set, we don't also redundantly specify SENT_START. token_dict.pop("SENT_START") - warnings.warn(Warnings.W092) + logger.debug(Warnings.W092) return { "token_annotation": token_dict, "doc_annotation": doc_dict diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index d7273b651..eb2eb5d71 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -689,7 +689,8 @@ Data augmentation is the process of applying small modifications to the training data. 
It can be especially useful for punctuation and case replacement – for example, if your corpus only uses smart quotes and you want to include variations using regular quotes, or to make the model less sensitive to -capitalization by including a mix of capitalized and lowercase examples. See the [usage guide](/usage/training#data-augmentation) for details and examples. +capitalization by including a mix of capitalized and lowercase examples. See the +[usage guide](/usage/training#data-augmentation) for details and examples. ### spacy.orth_variants.v1 {#orth_variants tag="registered function"} @@ -707,7 +708,7 @@ capitalization by including a mix of capitalized and lowercase examples. See the > ``` Create a data augmentation callback that uses orth-variant replacement. The -callback can be added to a corpus or other data iterator during training. This +callback can be added to a corpus or other data iterator during training. It's is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes etc. @@ -718,6 +719,25 @@ beyond corpora that don't have smart quotes, or only have smart quotes etc. | `orth_variants` | A dictionary containing the single and paired orth variants. Typically loaded from a JSON file. See [`en_orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. ~~Dict[str, Dict[List[Union[str, List[str]]]]]~~ | | **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ | +### spacy.lower_case.v1 {#lower_case tag="registered function"} + +> #### Example config +> +> ```ini +> [corpora.train.augmenter] +> @augmenters = "spacy.lower_case.v1" +> level = 0.3 +> ``` + +Create a data augmentation callback that lowercases documents. The callback can +be added to a corpus or other data iterator during training. It's especially +useful for making the model less sensitive to capitalization. + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `level` | The percentage of texts that will be augmented. ~~float~~ | +| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ | + ## Training data and alignment {#gold source="spacy/training"} ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"} @@ -827,10 +847,10 @@ utilities. ### util.get_lang_class {#util.get_lang_class tag="function"} Import and load a `Language` class. Allows lazy-loading -[language data](/usage/linguistic-features#language-data) and importing languages using the -two-letter language code. To add a language code for a custom language class, -you can register it using the [`@registry.languages`](/api/top-level#registry) -decorator. +[language data](/usage/linguistic-features#language-data) and importing +languages using the two-letter language code. To add a language code for a +custom language class, you can register it using the +[`@registry.languages`](/api/top-level#registry) decorator. > #### Example >