diff --git a/examples/training/rehearsal.py b/examples/training/rehearsal.py index 8c94ab14e..a7eb120c9 100644 --- a/examples/training/rehearsal.py +++ b/examples/training/rehearsal.py @@ -33,7 +33,7 @@ def read_raw_data(nlp, jsonl_loc): for json_obj in srsly.read_jsonl(jsonl_loc): if json_obj["text"].strip(): doc = nlp.make_doc(json_obj["text"]) - yield doc + yield Example.from_dict(doc, {}) def read_gold_data(nlp, gold_loc): @@ -52,7 +52,7 @@ def main(model_name, unlabelled_loc): batch_size = 4 nlp = spacy.load(model_name) nlp.get_pipe("ner").add_label(LABEL) - raw_docs = list(read_raw_data(nlp, unlabelled_loc)) + raw_examples = list(read_raw_data(nlp, unlabelled_loc)) optimizer = nlp.resume_training() # Avoid use of Adam when resuming training. I don't understand this well # yet, but I'm getting weird results from Adam. Try commenting out the @@ -61,20 +61,24 @@ def main(model_name, unlabelled_loc): optimizer.learn_rate = 0.1 optimizer.b1 = 0.0 optimizer.b2 = 0.0 - sizes = compounding(1.0, 4.0, 1.001) + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + with nlp.select_pipes(enable="ner") and warnings.catch_warnings(): # show warnings for misaligned entity spans once warnings.filterwarnings("once", category=UserWarning, module="spacy") for itn in range(n_iter): - random.shuffle(TRAIN_DATA) - random.shuffle(raw_docs) + random.shuffle(train_examples) + random.shuffle(raw_examples) losses = {} r_losses = {} # batch up the examples using spaCy's minibatch - raw_batches = minibatch(raw_docs, size=4) - for batch in minibatch(TRAIN_DATA, size=sizes): + raw_batches = minibatch(raw_examples, size=4) + for batch in minibatch(train_examples, size=sizes): nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses) raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) diff --git a/examples/training/train_entity_linker.py 
b/examples/training/train_entity_linker.py index b82ff5bb4..e107b6165 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -20,6 +20,8 @@ from pathlib import Path from spacy.vocab import Vocab import spacy from spacy.kb import KnowledgeBase + +from spacy.gold import Example from spacy.pipeline import EntityRuler from spacy.util import minibatch, compounding @@ -94,7 +96,7 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): # Convert the texts to docs to make sure we have doc.ents set for the training examples. # Also ensure that the annotated examples correspond to known identifiers in the knowledge base. kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings() - TRAIN_DOCS = [] + train_examples = [] for text, annotation in TRAIN_DATA: with nlp.select_pipes(disable="entity_linker"): doc = nlp(text) @@ -109,17 +111,17 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): "Removed", kb_id, "from training because it is not in the KB." 
) annotation_clean["links"][offset] = new_dict - TRAIN_DOCS.append((doc, annotation_clean)) + train_examples .append(Example.from_dict(doc, annotation_clean)) with nlp.select_pipes(enable="entity_linker"): # only train entity linker # reset and initialize the weights randomly optimizer = nlp.begin_training() for itn in range(n_iter): - random.shuffle(TRAIN_DOCS) + random.shuffle(train_examples) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update( batch, diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index df1356e3c..fffa140f4 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -23,6 +23,7 @@ import plac import random from pathlib import Path import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding @@ -120,17 +121,19 @@ def main(model=None, output_dir=None, n_iter=15): parser = nlp.create_pipe("parser") nlp.add_pipe(parser, first=True) + train_examples = [] for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) with nlp.select_pipes(enable="parser"): # only train parser optimizer = nlp.begin_training() for itn in range(n_iter): - random.shuffle(TRAIN_DATA) + random.shuffle(train_examples) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) diff --git a/examples/training/train_morphologizer.py b/examples/training/train_morphologizer.py index aec114de7..8c39a28a6 100644 --- 
a/examples/training/train_morphologizer.py +++ b/examples/training/train_morphologizer.py @@ -14,6 +14,7 @@ import plac import random from pathlib import Path import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding from spacy.morphology import Morphology @@ -84,8 +85,10 @@ def main(lang="en", output_dir=None, n_iter=25): morphologizer = nlp.create_pipe("morphologizer") nlp.add_pipe(morphologizer) - # add labels - for _, annotations in TRAIN_DATA: + # add labels and create the Example instances + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) morph_labels = annotations.get("morphs") pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs"))) assert len(morph_labels) == len(pos_labels) @@ -98,10 +101,10 @@ def main(lang="en", output_dir=None, n_iter=25): optimizer = nlp.begin_training() for i in range(n_iter): - random.shuffle(TRAIN_DATA) + random.shuffle(train_examples) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index 98b428bf8..26b283777 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -17,6 +17,7 @@ import random import warnings from pathlib import Path import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding @@ -50,8 +51,10 @@ def main(model=None, output_dir=None, n_iter=100): else: ner = nlp.get_pipe("simple_ner") - # add labels - for _, annotations in TRAIN_DATA: + # add labels and create Example objects + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), 
annotations)) for ent in annotations.get("entities"): print("Add label", ent[2]) ner.add_label(ent[2]) @@ -68,10 +71,10 @@ def main(model=None, output_dir=None, n_iter=100): "Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names())) ) for itn in range(n_iter): - random.shuffle(TRAIN_DATA) + random.shuffle(train_examples) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update( batch, diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 5124d0a2c..c4edafac4 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -80,6 +80,10 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): print("Created blank 'en' model") # Add entity recognizer to model if it's not in the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy + train_examples = [] + for text, annotation in TRAIN_DATA: + train_examples.append(TRAIN_DATA.from_dict(nlp(text), annotation)) + if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner) @@ -102,8 +106,8 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): sizes = compounding(1.0, 4.0, 1.001) # batch up the examples using spaCy's minibatch for itn in range(n_iter): - random.shuffle(TRAIN_DATA) - batches = minibatch(TRAIN_DATA, size=sizes) + random.shuffle(train_examples) + batches = minibatch(train_examples, size=sizes) losses = {} for batch in batches: nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index 4f4409e31..d46a8f4b9 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -14,6 +14,7 @@ import plac import random from pathlib 
import Path import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding @@ -59,18 +60,20 @@ def main(model=None, output_dir=None, n_iter=15): else: parser = nlp.get_pipe("parser") - # add labels to the parser - for _, annotations in TRAIN_DATA: + # add labels to the parser and create the Example objects + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) with nlp.select_pipes(enable="parser"): # only train parser optimizer = nlp.begin_training() for itn in range(n_iter): - random.shuffle(TRAIN_DATA) + random.shuffle(train_examples) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) print("Losses", losses) diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 06e05f6cd..4eeb77fb9 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -17,6 +17,7 @@ import plac import random from pathlib import Path import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding @@ -58,12 +59,16 @@ def main(lang="en", output_dir=None, n_iter=25): tagger.add_label(tag, values) nlp.add_pipe(tagger) + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + optimizer = nlp.begin_training() for i in range(n_iter): - random.shuffle(TRAIN_DATA) + random.shuffle(train_examples) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(batch, 
sgd=optimizer, losses=losses) print("Losses", losses) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index ee9f3e707..3dc9f1027 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -31,17 +31,20 @@ def profile_cli( def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None: - try: - import ml_datasets - except ImportError: - msg.fail( - "This command requires the ml_datasets library to be installed:" - "pip install ml_datasets", - exits=1, - ) + if inputs is not None: inputs = _read_inputs(inputs, msg) if inputs is None: + try: + import ml_datasets + except ImportError: + msg.fail( + "This command, when run without an input file, " + "requires the ml_datasets library to be installed: " + "pip install ml_datasets", + exits=1, + ) + n_inputs = 25000 with msg.loading("Loading IMDB dataset via Thinc..."): imdb_train, _ = ml_datasets.imdb() diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 92fd8c20a..b974247bd 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -12,7 +12,7 @@ from thinc.api import Model, use_pytorch_for_gpu_memory import random from ._app import app, Arg, Opt -from ..gold import Corpus +from ..gold import Corpus, Example from ..lookups import Lookups from .. import util from ..errors import Errors @@ -423,9 +423,8 @@ def train_while_improving( if raw_text: random.shuffle(raw_text) - raw_batches = util.minibatch( - (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 - ) + raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text] + raw_batches = util.minibatch(raw_examples, size=8) for step, (epoch, batch) in enumerate(train_data): dropout = next(dropouts) diff --git a/spacy/errors.py b/spacy/errors.py index 07cf7bbdf..4e73aee6f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -547,13 +547,13 @@ class Errors(object): E972 = ("Example.__init__ got None for '{arg}'. 
Requires Doc.") E973 = ("Unexpected type for NER data") E974 = ("Unknown {obj} attribute: {key}") - E975 = ("The method Example.from_dict expects a Doc as first argument, " + E975 = ("The method 'Example.from_dict' expects a Doc as first argument, " "but got {type}") - E976 = ("The method Example.from_dict expects a dict as second argument, " + E976 = ("The method 'Example.from_dict' expects a dict as second argument, " "but received None.") E977 = ("Can not compare a MorphAnalysis with a string object. " "This is likely a bug in spaCy, so feel free to open an issue.") - E978 = ("The {method} method of component {name} takes a list of Example objects, " + E978 = ("The '{method}' method of {name} takes a list of Example objects, " "but found {types} instead.") E979 = ("Cannot convert {type} to an Example object.") E980 = ("Each link annotation should refer to a dictionary with at most one " diff --git a/spacy/language.py b/spacy/language.py index 573b83e5f..dbc213574 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2,6 +2,7 @@ import random import itertools import weakref import functools +from collections import Iterable from contextlib import contextmanager from copy import copy, deepcopy from pathlib import Path @@ -529,22 +530,6 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def _convert_examples(self, examples): - converted_examples = [] - if isinstance(examples, tuple): - examples = [examples] - for eg in examples: - if isinstance(eg, Example): - converted_examples.append(eg.copy()) - elif isinstance(eg, tuple): - doc, annot = eg - if isinstance(doc, str): - doc = self.make_doc(doc) - converted_examples.append(Example.from_dict(doc, annot)) - else: - raise ValueError(Errors.E979.format(type=type(eg))) - return converted_examples - def update( self, examples, @@ -557,7 +542,7 @@ class Language(object): ): """Update the models in the pipeline. - examples (iterable): A batch of `Example` or `Doc` objects. 
+ examples (iterable): A batch of `Example` objects. dummy: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (callable): An optimizer. @@ -569,10 +554,13 @@ class Language(object): """ if dummy is not None: raise ValueError(Errors.E989) - if len(examples) == 0: return - examples = self._convert_examples(examples) + if not isinstance(examples, Iterable): + raise TypeError(Errors.E978.format(name="language", method="update", types=type(examples))) + wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) + if wrong_types: + raise TypeError(Errors.E978.format(name="language", method="update", types=wrong_types)) if sgd is None: if self._optimizer is None: @@ -605,22 +593,26 @@ class Language(object): initial ones. This is useful for keeping a pretrained model on-track, even if you're updating it with a smaller set of examples. - examples (iterable): A batch of `Doc` objects. + examples (iterable): A batch of `Example` objects. drop (float): The dropout rate. sgd (callable): An optimizer. RETURNS (dict): Results from the update. 
EXAMPLE: >>> raw_text_batches = minibatch(raw_texts) - >>> for labelled_batch in minibatch(zip(train_docs, train_golds)): + >>> for labelled_batch in minibatch(examples): >>> nlp.update(labelled_batch) - >>> raw_batch = [nlp.make_doc(text) for text in next(raw_text_batches)] + >>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)] >>> nlp.rehearse(raw_batch) """ # TODO: document if len(examples) == 0: return - examples = self._convert_examples(examples) + if not isinstance(examples, Iterable): + raise TypeError(Errors.E978.format(name="language", method="rehearse", types=type(examples))) + wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) + if wrong_types: + raise TypeError(Errors.E978.format(name="language", method="rehearse", types=wrong_types)) if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() @@ -696,7 +688,7 @@ class Language(object): component that has a .rehearse() method. Rehearsal is used to prevent models from "forgetting" their initialised "knowledge". To perform rehearsal, collect samples of text you want the models to retain performance - on, and call nlp.rehearse() with a batch of Doc objects. + on, and call nlp.rehearse() with a batch of Example objects. 
""" if cfg.get("device", -1) >= 0: util.use_gpu(cfg["device"]) @@ -728,7 +720,11 @@ class Language(object): DOCS: https://spacy.io/api/language#evaluate """ - examples = self._convert_examples(examples) + if not isinstance(examples, Iterable): + raise TypeError(Errors.E978.format(name="language", method="evaluate", types=type(examples))) + wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) + if wrong_types: + raise TypeError(Errors.E978.format(name="language", method="evaluate", types=wrong_types)) if scorer is None: scorer = Scorer(pipeline=self.pipeline) if component_cfg is None: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index be28dcc85..ed700b09a 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -295,7 +295,7 @@ class Tagger(Pipe): return except AttributeError: types = set([type(eg) for eg in examples]) - raise ValueError(Errors.E978.format(name="Tagger", method="update", types=types)) + raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) set_dropout_rate(self.model, drop) tag_scores, bp_tag_scores = self.model.begin_update( [eg.predicted for eg in examples]) @@ -321,7 +321,7 @@ class Tagger(Pipe): docs = [eg.predicted for eg in examples] except AttributeError: types = set([type(eg) for eg in examples]) - raise ValueError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) if self._rehearsal_model is None: return if not any(len(doc) for doc in docs): @@ -358,7 +358,7 @@ class Tagger(Pipe): try: y = example.y except AttributeError: - raise ValueError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) + raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) for token in y: tag = token.tag_ if tag in orig_tag_map: @@ -790,7 +790,7 @@ class ClozeMultitask(Pipe): predictions, bp_predictions = 
self.model.begin_update([eg.predicted for eg in examples]) except AttributeError: types = set([type(eg) for eg in examples]) - raise ValueError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: @@ -856,7 +856,7 @@ class TextCategorizer(Pipe): return except AttributeError: types = set([type(eg) for eg in examples]) - raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) + raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update( [eg.predicted for eg in examples] @@ -879,7 +879,7 @@ class TextCategorizer(Pipe): docs = [eg.predicted for eg in examples] except AttributeError: types = set([type(eg) for eg in examples]) - raise ValueError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types)) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
return @@ -940,7 +940,7 @@ class TextCategorizer(Pipe): try: y = example.y except AttributeError: - raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example))) + raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example))) for cat in y.cats: self.add_label(cat) self.require_labels() @@ -1105,7 +1105,7 @@ class EntityLinker(Pipe): docs = [eg.predicted for eg in examples] except AttributeError: types = set([type(eg) for eg in examples]) - raise ValueError(Errors.E978.format(name="EntityLinker", method="update", types=types)) + raise TypeError(Errors.E978.format(name="EntityLinker", method="update", types=types)) if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 81484c083..2f828e7fa 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -209,6 +209,10 @@ def test_train_empty(): ] nlp = English() + train_examples = [] + for t in train_data: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + ner = nlp.create_pipe("ner") ner.add_label("PERSON") nlp.add_pipe(ner, last=True) @@ -216,10 +220,9 @@ def test_train_empty(): nlp.begin_training() for itn in range(2): losses = {} - batches = util.minibatch(train_data) + batches = util.minibatch(train_examples) for batch in batches: - texts, annotations = zip(*batch) - nlp.update(train_data, losses=losses) + nlp.update(batch, losses=losses) def test_overwrite_token(): @@ -328,7 +331,9 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly nlp = English() ner = nlp.create_pipe("ner") - for _, annotations in TRAIN_DATA: + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for ent in 
annotations.get("entities"): ner.add_label(ent[2]) nlp.add_pipe(ner) @@ -336,7 +341,7 @@ def test_overfitting_IO(): for i in range(50): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["ner"] < 0.00001 # test the trained model diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index c54088f56..4cff31712 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -3,6 +3,7 @@ import pytest from spacy.lang.en import English from ..util import get_doc, apply_transition_sequence, make_tempdir from ... import util +from ...gold import Example TRAIN_DATA = [ ( @@ -189,7 +190,9 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() parser = nlp.create_pipe("parser") - for _, annotations in TRAIN_DATA: + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for dep in annotations.get("deps", []): parser.add_label(dep) nlp.add_pipe(parser) @@ -197,7 +200,7 @@ def test_overfitting_IO(): for i in range(50): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["parser"] < 0.00001 # test the trained model diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index a50ad8499..f91cc6f70 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -3,6 +3,7 @@ import pytest from spacy.kb import KnowledgeBase from spacy import util +from spacy.gold import Example from spacy.lang.en import English from spacy.pipeline import EntityRuler from spacy.tests.util import make_tempdir @@ -283,11 +284,10 @@ def test_overfitting_IO(): nlp.add_pipe(ruler) # Convert the texts to docs to make sure we 
have doc.ents set for the training examples - TRAIN_DOCS = [] + train_examples = [] for text, annotation in TRAIN_DATA: doc = nlp(text) - annotation_clean = annotation - TRAIN_DOCS.append((doc, annotation_clean)) + train_examples.append(Example.from_dict(doc, annotation)) # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer @@ -309,7 +309,7 @@ def test_overfitting_IO(): optimizer = nlp.begin_training() for i in range(50): losses = {} - nlp.update(TRAIN_DOCS, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["entity_linker"] < 0.001 # test the trained model diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index c853de232..9b7e2788d 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,6 +1,7 @@ import pytest from spacy import util +from spacy.gold import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -33,7 +34,9 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly nlp = English() morphologizer = nlp.create_pipe("morphologizer") + train_examples = [] for inst in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): morphologizer.add_label(morph + "|POS=" + pos) nlp.add_pipe(morphologizer) @@ -41,7 +44,7 @@ def test_overfitting_IO(): for i in range(50): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["morphologizer"] < 0.00001 # test the trained model diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 041da2c9f..bfa1bd65a 100644 --- a/spacy/tests/pipeline/test_senter.py +++ 
b/spacy/tests/pipeline/test_senter.py @@ -1,6 +1,7 @@ import pytest from spacy import util +from spacy.gold import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -34,12 +35,15 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly nlp = English() senter = nlp.create_pipe("senter") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) nlp.add_pipe(senter) optimizer = nlp.begin_training() for i in range(200): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["senter"] < 0.001 # test the trained model diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index a90207a78..aedf8e2b3 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,7 @@ import pytest from spacy import util +from spacy.gold import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -28,12 +29,15 @@ def test_overfitting_IO(): tagger = nlp.create_pipe("tagger") for tag, values in TAG_MAP.items(): tagger.add_label(tag, values) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) nlp.add_pipe(tagger) optimizer = nlp.begin_training() for i in range(50): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["tagger"] < 0.00001 # test the trained model diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 6f01ada69..214163a97 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -85,7 +85,9 @@ def test_overfitting_IO(): fix_random_seed(0) nlp = 
English() textcat = nlp.create_pipe("textcat") - for _, annotations in TRAIN_DATA: + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) nlp.add_pipe(textcat) @@ -93,7 +95,7 @@ def test_overfitting_IO(): for i in range(50): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["textcat"] < 0.01 # test the trained model @@ -134,11 +136,13 @@ def test_textcat_configs(textcat_config): pipe_config = {"model": textcat_config} nlp = English() textcat = nlp.create_pipe("textcat", pipe_config) - for _, annotations in TRAIN_DATA: + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) for label, value in annotations.get("cats").items(): textcat.add_label(label) nlp.add_pipe(textcat) optimizer = nlp.begin_training() for i in range(5): losses = {} - nlp.update(TRAIN_DATA, sgd=optimizer, losses=losses) + nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 033e4f83e..5d504a9c6 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -1,5 +1,6 @@ import pytest from spacy import displacy +from spacy.gold import Example from spacy.lang.en import English from spacy.lang.ja import Japanese from spacy.lang.xx import MultiLanguage @@ -141,10 +142,10 @@ def test_issue2800(): """Test issue that arises when too many labels are added to NER model. Used to cause segfault. 
""" - train_data = [] - train_data.extend([("One sentence", {"entities": []})]) - entity_types = [str(i) for i in range(1000)] nlp = English() + train_data = [] + train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]) + entity_types = [str(i) for i in range(1000)] ner = nlp.create_pipe("ner") nlp.add_pipe(ner) for entity_type in list(entity_types): @@ -153,8 +154,8 @@ def test_issue2800(): for i in range(20): losses = {} random.shuffle(train_data) - for statement, entities in train_data: - nlp.update((statement, entities), sgd=optimizer, losses=losses, drop=0.5) + for example in train_data: + nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) def test_issue2822(it_tokenizer): diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index cab68793c..67bc88466 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -1,4 +1,5 @@ import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding @@ -12,15 +13,15 @@ def test_issue3611(): ] y_train = ["offensive", "offensive", "inoffensive"] - # preparing the data - pos_cats = list() - for train_instance in y_train: - pos_cats.append({label: label == train_instance for label in unique_classes}) - train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats])) - - # set up the spacy model with a text categorizer component nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + + # add a text categorizer component textcat = nlp.create_pipe( "textcat", config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index b641213ad..12a320c71 
100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -1,4 +1,5 @@ import spacy +from spacy.gold import Example from spacy.util import minibatch, compounding @@ -12,15 +13,15 @@ def test_issue4030(): ] y_train = ["offensive", "offensive", "inoffensive"] - # preparing the data - pos_cats = list() - for train_instance in y_train: - pos_cats.append({label: label == train_instance for label in unique_classes}) - train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats])) - - # set up the spacy model with a text categorizer component nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + + # add a text categorizer component textcat = nlp.create_pipe( "textcat", config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index 4978e0c8e..d7a12d054 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -1,3 +1,4 @@ +from spacy.gold import Example from spacy.lang.en import English from spacy.util import minibatch, compounding import pytest @@ -7,9 +8,10 @@ import pytest def test_issue4348(): """Test that training the tagger with empty data, doesn't throw errors""" - TRAIN_DATA = [("", {"tags": []}), ("", {"tags": []})] - nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.create_pipe("tagger") nlp.add_pipe(tagger) diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py index 10c7868a0..c3d3c4326 100644 --- a/spacy/tests/regression/test_issue4924.py +++ b/spacy/tests/regression/test_issue4924.py @@ -1,7 +1,8 @@ +from spacy.gold 
import Example from spacy.language import Language def test_issue4924(): nlp = Language() - docs_golds = [("", {})] - nlp.evaluate(docs_golds) + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index a5e11ea28..0ed4d50d5 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -589,7 +589,7 @@ def test_tuple_format_implicit(): ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}), ] - _train(train_data) + _train_tuples(train_data) def test_tuple_format_implicit_invalid(): @@ -605,20 +605,24 @@ def test_tuple_format_implicit_invalid(): ] with pytest.raises(KeyError): - _train(train_data) + _train_tuples(train_data) -def _train(train_data): +def _train_tuples(train_data): nlp = English() ner = nlp.create_pipe("ner") ner.add_label("ORG") ner.add_label("LOC") nlp.add_pipe(ner) + train_examples = [] + for t in train_data: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.begin_training() for i in range(5): losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index e5555bbc7..7b4c29c5a 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -5,6 +5,7 @@ from spacy.tokens import Doc, Span from spacy.vocab import Vocab from .util import add_vecs_to_vocab, assert_docs_equal +from ..gold import Example @pytest.fixture @@ -23,26 +24,45 @@ def test_language_update(nlp): annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} wrongkeyannots = {"LABEL": True} doc = Doc(nlp.vocab, words=text.split(" ")) - # Update with text and dict - nlp.update((text, annots)) + example = Example.from_dict(doc, annots) + nlp.update([example]) + + # Not allowed to call with 
just one Example + with pytest.raises(TypeError): + nlp.update(example) + + # Update with text and dict: no longer supported as of v3 + with pytest.raises(TypeError): + nlp.update((text, annots)) # Update with doc object and dict - nlp.update((doc, annots)) - # Update badly + with pytest.raises(TypeError): + nlp.update((doc, annots)) + + # Create examples badly with pytest.raises(ValueError): - nlp.update((doc, None)) + example = Example.from_dict(doc, None) with pytest.raises(KeyError): - nlp.update((text, wrongkeyannots)) + example = Example.from_dict(doc, wrongkeyannots) def test_language_evaluate(nlp): text = "hello world" annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} doc = Doc(nlp.vocab, words=text.split(" ")) - # Evaluate with text and dict - nlp.evaluate([(text, annots)]) + example = Example.from_dict(doc, annots) + nlp.evaluate([example]) + + # Not allowed to call with just one Example + with pytest.raises(TypeError): + nlp.evaluate(example) + + # Evaluate with text and dict: no longer supported as of v3 + with pytest.raises(TypeError): + nlp.evaluate([(text, annots)]) # Evaluate with doc object and dict - nlp.evaluate([(doc, annots)]) - with pytest.raises(Exception): + with pytest.raises(TypeError): + nlp.evaluate([(doc, annots)]) + with pytest.raises(TypeError): nlp.evaluate([text, annots]) @@ -56,8 +76,9 @@ def test_evaluate_no_pipe(nlp): text = "hello world" annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}} nlp = Language(Vocab()) + doc = nlp(text) nlp.add_pipe(pipe) - nlp.evaluate([(text, annots)]) + nlp.evaluate([Example.from_dict(doc, annots)]) def vector_modification_pipe(doc):