diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
index 6c3a21f4b..f76336d84 100644
--- a/examples/experiments/onto-joint/defaults.cfg
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -9,6 +9,7 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
+noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -24,8 +25,8 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-vectors = null
 discard_oversize = false
+omit_extra_lookups = false

 [training.batch_size]
 @schedules = "compounding.v1"
@@ -52,7 +53,7 @@ learn_rate = 0.001

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@@ -62,12 +63,20 @@ factory = "senter"

 [nlp.pipeline.ner]
 factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger]
 factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.senter.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg
index 4f1898d69..40885b6e8 100644
--- a/examples/experiments/onto-joint/pretrain.cfg
+++ b/examples/experiments/onto-joint/pretrain.cfg
@@ -9,6 +9,7 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
+noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
@@ -24,7 +25,6 @@ scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-vectors = null
 discard_oversize = false

 [training.batch_size]
@@ -72,7 +72,7 @@ normalize = true

 [nlp]
 lang = "en"
-vectors = ${training:vectors}
+vectors = null

 [nlp.pipeline.tok2vec]
 factory = "tok2vec"
@@ -82,12 +82,20 @@ factory = "senter"

 [nlp.pipeline.ner]
 factory = "ner"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger]
 factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.senter.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index acbcc8d41..905b5b4e0 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -6,6 +6,7 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
+noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0
@@ -40,6 +41,10 @@ factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index c305c015c..7383116e7 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -6,6 +6,7 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
+noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1
@@ -40,6 +41,10 @@ factory = "tagger"

 [nlp.pipeline.parser]
 factory = "parser"
+learn_tokens = false
+min_action_freq = 1
+beam_width = 1
+beam_update_prob = 1.0

 [nlp.pipeline.tagger.model]
 @architectures = "spacy.Tagger.v1"
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 65acadb07..c5e679467 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -120,13 +120,22 @@ def load_data(dataset, threshold, limit=0, split=0.8):
     random.shuffle(train_data)
     texts, labels = zip(*train_data)

-    unique_labels = sorted(set([l for label_set in labels for l in label_set]))
+    unique_labels = set()
+    for label_set in labels:
+        if isinstance(label_set, int) or isinstance(label_set, str):
+            unique_labels.add(label_set)
+        elif isinstance(label_set, list) or isinstance(label_set, set):
+            unique_labels.update(label_set)
+    unique_labels = sorted(unique_labels)
     print(f"# of unique_labels: {len(unique_labels)}")

     count_values_train = dict()
     for text, annot_list in train_data:
-        for annot in annot_list:
-            count_values_train[annot] = count_values_train.get(annot, 0) + 1
+        if isinstance(annot_list, int) or isinstance(annot_list, str):
+            count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
+        else:
+            for annot in annot_list:
+                count_values_train[annot] = count_values_train.get(annot, 0) + 1
     for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
         if count < threshold:
             unique_labels.remove(value)
@@ -138,7 +147,7 @@ def load_data(dataset, threshold, limit=0, split=0.8):
     else:
         cats = []
         for y in labels:
-            if isinstance(y, str):
+            if isinstance(y, str) or isinstance(y, int):
                 cats.append({str(label): (label == y) for label in unique_labels})
             elif isinstance(y, set):
                 cats.append({str(label): (label in y) for label in 
unique_labels}) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 735e304f9..bae252b1c 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -54,7 +54,8 @@ def evaluate( "NER P": f"{scorer.ents_p:.2f}", "NER R": f"{scorer.ents_r:.2f}", "NER F": f"{scorer.ents_f:.2f}", - "Textcat": f"{scorer.textcat_score:.2f}", + "Textcat AUC": f"{scorer.textcat_auc:.2f}", + "Textcat F": f"{scorer.textcat_f:.2f}", "Sent P": f"{scorer.sent_p:.2f}", "Sent R": f"{scorer.sent_r:.2f}", "Sent F": f"{scorer.sent_f:.2f}", diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index d37426b5a..4f4707b52 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -266,17 +266,15 @@ def create_pretraining_model(nlp, tok2vec): the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. + The actual tok2vec layer is stored as a reference, and only this bit will be + serialized to file and read back in when calling the 'train' command. """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size) ) - # This is annoying, but the parser etc have the flatten step after - # the tok2vec. To load the weights in cleanly, we need to match - # the shape of the models' components exactly. So what we cann - # "tok2vec" has to be the same set of processes as what the components do. - tok2vec = chain(tok2vec, list2array()) - model = chain(tok2vec, output_layer) + model = chain(tok2vec, list2array()) + model = chain(model, output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) mlm_model = build_masked_language_model(nlp.vocab, model) mlm_model.set_ref("tok2vec", tok2vec) diff --git a/spacy/cli/train.py b/spacy/cli/train.py deleted file mode 100644 index cbe977cad..000000000 --- a/spacy/cli/train.py +++ /dev/null @@ -1,773 +0,0 @@ -import os -import tqdm -from pathlib import Path -from thinc.api import use_ops -from timeit import default_timer as timer -import shutil -import srsly -from wasabi import msg -import contextlib -import random - -from ..util import create_default_optimizer -from ..util import use_gpu as set_gpu -from ..gold import GoldCorpus -from ..lookups import Lookups -from .. import util -from .. 
import about - - -def train( - # fmt: off - lang: ("Model language", "positional", None, str), - output_path: ("Output directory to store model in", "positional", None, Path), - train_path: ("Location of JSON-formatted training data", "positional", None, Path), - dev_path: ("Location of JSON-formatted development data", "positional", None, Path), - raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, - base_model: ("Name of model to update (optional)", "option", "b", str) = None, - pipeline: ("Comma-separated names of pipeline components", "option", "p", str) = "tagger,parser,ner", - vectors: ("Model to load vectors from", "option", "v", str) = None, - replace_components: ("Replace components from base model", "flag", "R", bool) = False, - n_iter: ("Number of iterations", "option", "n", int) = 30, - n_early_stopping: ("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int) = None, - n_examples: ("Number of examples", "option", "ns", int) = 0, - use_gpu: ("Use GPU", "option", "g", int) = -1, - version: ("Model version", "option", "V", str) = "0.0.0", - meta_path: ("Optional path to meta.json to use as base.", "option", "m", Path) = None, - init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - parser_multitasks: ("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str) = "", - entity_multitasks: ("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str) = "", - noise_level: ("Amount of corruption for data augmentation", "option", "nl", float) = 0.0, - orth_variant_level: ("Amount of orthography variation for data augmentation", "option", "ovl", float) = 0.0, - eval_beam_widths: ("Beam widths to evaluate, e.g. 4,8", "option", "bw", str) = "", - gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False, - learn_tokens: ("Make parser learn gold-standard tokenization", "flag", "T", bool) = False, - textcat_multilabel: ("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool) = False, - textcat_arch: ("Textcat model architecture", "option", "ta", str) = "bow", - textcat_positive_label: ("Textcat positive label for binary classes with two labels", "option", "tpl", str) = None, - tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, - omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, - verbose: ("Display more information for debug", "flag", "VV", bool) = False, - debug: ("Run data diagnostics before training", "flag", "D", bool) = False, - # fmt: on -): - """ - Train or update a spaCy model. Requires data to be formatted in spaCy's - JSON format. To convert data from other formats, use the `spacy convert` - command. 
- """ - util.fix_random_seed() - util.set_env_log(verbose) - - # Make sure all files and paths exists if they are needed - train_path = util.ensure_path(train_path) - dev_path = util.ensure_path(dev_path) - meta_path = util.ensure_path(meta_path) - output_path = util.ensure_path(output_path) - if raw_text is not None: - raw_text = list(srsly.read_jsonl(raw_text)) - if not train_path or not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if meta_path is not None and not meta_path.exists(): - msg.fail("Can't find model meta.json", meta_path, exits=1) - meta = srsly.read_json(meta_path) if meta_path else {} - if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) - if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - - tag_map = {} - if tag_map_path is not None: - tag_map = srsly.read_json(tag_map_path) - # Take dropout and batch size as generators of values -- dropout - # starts high and decays sharply, to force the optimizer to explore. - # Batch size starts at 1 and grows, so that we make updates quickly - # at the beginning of training. - dropout_rates = util.decaying( - util.env_opt("dropout_from", 0.2), - util.env_opt("dropout_to", 0.2), - util.env_opt("dropout_decay", 0.0), - ) - batch_sizes = util.compounding( - util.env_opt("batch_from", 100.0), - util.env_opt("batch_to", 1000.0), - util.env_opt("batch_compound", 1.001), - ) - - if not eval_beam_widths: - eval_beam_widths = [1] - else: - eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] - if 1 not in eval_beam_widths: - eval_beam_widths.append(1) - eval_beam_widths.sort() - has_beam_widths = eval_beam_widths != [1] - - default_dir = Path(__file__).parent.parent / "pipeline" / "defaults" - - # Set up the base model and pipeline. If a base model is specified, load - # the model and make sure the pipeline matches the pipeline setting. If - # training starts from a blank model, intitalize the language class. - pipeline = [p.strip() for p in pipeline.split(",")] - msg.text(f"Training pipeline: {pipeline}") - disabled_pipes = None - pipes_added = False - if use_gpu >= 0: - activated_gpu = None - try: - activated_gpu = set_gpu(use_gpu) - except Exception as e: - msg.warn(f"Exception: {e}") - if activated_gpu is not None: - msg.text(f"Using GPU: {use_gpu}") - else: - msg.warn(f"Unable to activate GPU: {use_gpu}") - msg.text("Using CPU only") - use_gpu = -1 - if base_model: - msg.text(f"Starting with base model '{base_model}'") - nlp = util.load_model(base_model) - if nlp.lang != lang: - msg.fail( - f"Model language ('{nlp.lang}') doesn't match language " - f"specified as `lang` argument ('{lang}') ", - exits=1, - ) - if vectors: - msg.text(f"Loading vectors from model '{vectors}'") - _load_vectors(nlp, vectors) - - nlp.select_pipes(disable=[p for p in nlp.pipe_names if p not in pipeline]) - for pipe in pipeline: - # first, create the model. 
- # Bit of a hack after the refactor to get the vectors into a default config - # use train-from-config instead :-) - if pipe == "parser": - config_loc = default_dir / "parser_defaults.cfg" - elif pipe == "tagger": - config_loc = default_dir / "tagger_defaults.cfg" - elif pipe == "ner": - config_loc = default_dir / "ner_defaults.cfg" - elif pipe == "textcat": - config_loc = default_dir / "textcat_defaults.cfg" - elif pipe == "senter": - config_loc = default_dir / "senter_defaults.cfg" - else: - raise ValueError(f"Component {pipe} currently not supported.") - pipe_cfg = util.load_config(config_loc, create_objects=False) - if vectors: - pretrained_config = { - "@architectures": "spacy.VocabVectors.v1", - "name": vectors, - } - pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config - - if pipe == "parser": - pipe_cfg["learn_tokens"] = learn_tokens - elif pipe == "textcat": - pipe_cfg["exclusive_classes"] = not textcat_multilabel - pipe_cfg["architecture"] = textcat_arch - pipe_cfg["positive_label"] = textcat_positive_label - - if pipe not in nlp.pipe_names: - msg.text(f"Adding component to base model '{pipe}'") - nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) - pipes_added = True - elif replace_components: - msg.text(f"Replacing component from base model '{pipe}'") - nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) - pipes_added = True - else: - if pipe == "textcat": - textcat_cfg = nlp.get_pipe("textcat").cfg - base_cfg = { - "exclusive_classes": textcat_cfg["exclusive_classes"], - "architecture": textcat_cfg["architecture"], - "positive_label": textcat_cfg["positive_label"], - } - if base_cfg != pipe_cfg: - msg.fail( - f"The base textcat model configuration does" - f"not match the provided training options. " - f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}", - exits=1, - ) - msg.text(f"Extending component from base model '{pipe}'") - disabled_pipes = nlp.select_pipes( - disable=[p for p in nlp.pipe_names if p not in pipeline] - ) - else: - msg.text(f"Starting with blank model '{lang}'") - lang_cls = util.get_lang_class(lang) - nlp = lang_cls() - - if vectors: - msg.text(f"Loading vectors from model '{vectors}'") - _load_vectors(nlp, vectors) - - for pipe in pipeline: - # first, create the model. 
- # Bit of a hack after the refactor to get the vectors into a default config - # use train-from-config instead :-) - if pipe == "parser": - config_loc = default_dir / "parser_defaults.cfg" - elif pipe == "tagger": - config_loc = default_dir / "tagger_defaults.cfg" - elif pipe == "morphologizer": - config_loc = default_dir / "morphologizer_defaults.cfg" - elif pipe == "ner": - config_loc = default_dir / "ner_defaults.cfg" - elif pipe == "textcat": - config_loc = default_dir / "textcat_defaults.cfg" - elif pipe == "senter": - config_loc = default_dir / "senter_defaults.cfg" - else: - raise ValueError(f"Component {pipe} currently not supported.") - pipe_cfg = util.load_config(config_loc, create_objects=False) - if vectors: - pretrained_config = { - "@architectures": "spacy.VocabVectors.v1", - "name": vectors, - } - pipe_cfg["model"]["tok2vec"]["pretrained_vectors"] = pretrained_config - - if pipe == "parser": - pipe_cfg["learn_tokens"] = learn_tokens - elif pipe == "textcat": - pipe_cfg["exclusive_classes"] = not textcat_multilabel - pipe_cfg["architecture"] = textcat_arch - pipe_cfg["positive_label"] = textcat_positive_label - - pipe = nlp.create_pipe(pipe, config=pipe_cfg) - nlp.add_pipe(pipe) - - # Update tag map with provided mapping - nlp.vocab.morphology.tag_map.update(tag_map) - - # Create empty extra lexeme tables so the data from spacy-lookups-data - # isn't loaded if these features are accessed - if omit_extra_lookups: - nlp.vocab.lookups_extra = Lookups() - nlp.vocab.lookups_extra.add_table("lexeme_cluster") - nlp.vocab.lookups_extra.add_table("lexeme_prob") - nlp.vocab.lookups_extra.add_table("lexeme_settings") - - if vectors: - msg.text("Loading vector from model '{}'".format(vectors)) - _load_vectors(nlp, vectors) - - # Multitask objectives - multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] - for pipe_name, multitasks in multitask_options: - if multitasks: - if pipe_name not in pipeline: - msg.fail( - f"Can't use multitask objective without '{pipe_name}' in " - f"the pipeline" - ) - pipe = nlp.get_pipe(pipe_name) - for objective in multitasks.split(","): - pipe.add_multitask_objective(objective) - - # Prepare training corpus - msg.text(f"Counting training words (limit={n_examples})") - corpus = GoldCorpus(train_path, dev_path, limit=n_examples) - n_train_words = corpus.count_train() - - if base_model and not pipes_added: - # Start with an existing model, use default optimizer - optimizer = create_default_optimizer() - else: - # Start with a blank model, call begin_training - cfg = {"device": use_gpu} - optimizer = nlp.begin_training(lambda: corpus.train_examples, **cfg) - nlp._optimizer = None - - # Load in pretrained weights (TODO: this may be broken in the config rewrite) - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") - - # Verify textcat config - if "textcat" in pipeline: - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) - if textcat_positive_label and textcat_positive_label not in textcat_labels: - msg.fail( - f"The textcat_positive_label (tpl) '{textcat_positive_label}' " - f"does not match any label in the training data.", - exits=1, - ) - if textcat_positive_label and len(textcat_labels) != 2: - msg.fail( - "A textcat_positive_label (tpl) '{textcat_positive_label}' was " - "provided for training data that does not appear to be a " - "binary classification problem with two labels.", - exits=1, - ) - train_data = 
corpus.train_data( - nlp, - noise_level=noise_level, - gold_preproc=gold_preproc, - max_length=0, - ignore_misaligned=True, - ) - train_labels = set() - if textcat_multilabel: - multilabel_found = False - for ex in train_data: - train_labels.update(ex.gold.cats.keys()) - if list(ex.gold.cats.values()).count(1.0) != 1: - multilabel_found = True - if not multilabel_found and not base_model: - msg.warn( - "The textcat training instances look like they have " - "mutually-exclusive classes. Remove the flag " - "'--textcat-multilabel' to train a classifier with " - "mutually-exclusive classes." - ) - if not textcat_multilabel: - for ex in train_data: - train_labels.update(ex.gold.cats.keys()) - if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model: - msg.warn( - "Some textcat training instances do not have exactly " - "one positive label. Modifying training options to " - "include the flag '--textcat-multilabel' for classes " - "that are not mutually exclusive." - ) - nlp.get_pipe("textcat").cfg["exclusive_classes"] = False - textcat_multilabel = True - break - if base_model and set(textcat_labels) != train_labels: - msg.fail( - f"Cannot extend textcat model using data with different " - f"labels. Base model labels: {textcat_labels}, training data " - f"labels: {list(train_labels)}", - exits=1, - ) - if textcat_multilabel: - msg.text( - f"Textcat evaluation score: ROC AUC score macro-averaged across " - f"the labels '{', '.join(textcat_labels)}'" - ) - elif textcat_positive_label and len(textcat_labels) == 2: - msg.text( - f"Textcat evaluation score: F1-score for the " - f"label '{textcat_positive_label}'" - ) - elif len(textcat_labels) > 1: - if len(textcat_labels) == 2: - msg.warn( - "If the textcat component is a binary classifier with " - "exclusive classes, provide '--textcat-positive-label' for " - "an evaluation on the positive class." - ) - msg.text( - f"Textcat evaluation score: F1-score macro-averaged across " - f"the labels '{', '.join(textcat_labels)}'" - ) - else: - msg.fail( - "Unsupported textcat configuration. Use `spacy debug-data` " - "for more information." - ) - - # fmt: off - row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths) - row_widths = [len(w) for w in row_head] - row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} - # fmt: on - print("") - msg.row(row_head, **row_settings) - msg.row(["-" * width for width in row_settings["widths"]], **row_settings) - try: - iter_since_best = 0 - best_score = 0.0 - for i in range(n_iter): - train_data = corpus.train_dataset( - nlp, - noise_level=noise_level, - orth_variant_level=orth_variant_level, - gold_preproc=gold_preproc, - max_length=0, - ignore_misaligned=True, - ) - if raw_text: - random.shuffle(raw_text) - raw_batches = util.minibatch( - (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 - ) - words_seen = 0 - with tqdm.tqdm(total=n_train_words, leave=False) as pbar: - losses = {} - for batch in util.minibatch_by_words(train_data, size=batch_sizes): - if not batch: - continue - try: - nlp.update( - batch, - sgd=optimizer, - drop=next(dropout_rates), - losses=losses, - ) - except ValueError as e: - err = "Error during training" - if init_tok2vec: - err += " Did you provide the same parameters during 'train' as during 'pretrain'?" - msg.fail(err, f"Original error message: {e}", exits=1) - if raw_text: - # If raw text is available, perform 'rehearsal' updates, - # which use unlabelled data to reduce overfitting. 
- raw_batch = list(next(raw_batches)) - nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) - docs = [ex.doc for ex in batch] - if not int(os.environ.get("LOG_FRIENDLY", 0)): - pbar.update(sum(len(doc) for doc in docs)) - words_seen += sum(len(doc) for doc in docs) - with nlp.use_params(optimizer.averages): - util.set_env_log(False) - epoch_model_path = output_path / f"model{i}" - nlp.to_disk(epoch_model_path) - nlp_loaded = util.load_model_from_path(epoch_model_path) - for beam_width in eval_beam_widths: - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, - ) - ) - nwords = sum(len(ex.doc) for ex in dev_dataset) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - if use_gpu < 0: - gpu_wps = None - cpu_wps = nwords / (end_time - start_time) - else: - gpu_wps = nwords / (end_time - start_time) - # Evaluate on CPU in the first iteration only (for - # timing) when GPU is enabled - if i == 0: - with use_ops("numpy"): - nlp_loaded = util.load_model_from_path(epoch_model_path) - for name, component in nlp_loaded.pipeline: - if hasattr(component, "cfg"): - component.cfg["beam_width"] = beam_width - dev_dataset = list( - corpus.dev_dataset( - nlp_loaded, - gold_preproc=gold_preproc, - ignore_misaligned=True, - ) - ) - start_time = timer() - scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose) - end_time = timer() - cpu_wps = nwords / (end_time - start_time) - acc_loc = output_path / f"model{i}" / "accuracy.json" - srsly.write_json(acc_loc, scorer.scores) - - # Update model meta.json - meta["lang"] = nlp.lang - meta["pipeline"] = nlp.pipe_names - if beam_width == 1: - meta["speed"] = { - "nwords": nwords, - "cpu": cpu_wps, - "gpu": gpu_wps, - } - meta.setdefault("accuracy", {}) - for component in nlp.pipe_names: - for metric in _get_metrics(component): - meta["accuracy"][metric] = scorer.scores[metric] - else: - meta.setdefault("beam_accuracy", {}) - meta.setdefault("beam_speed", {}) - for component in nlp.pipe_names: - for metric in _get_metrics(component): - meta["beam_accuracy"][metric] = scorer.scores[metric] - meta["beam_speed"][beam_width] = { - "nwords": nwords, - "cpu": cpu_wps, - "gpu": gpu_wps, - } - meta["vectors"] = { - "width": nlp.vocab.vectors_length, - "vectors": len(nlp.vocab.vectors), - "keys": nlp.vocab.vectors.n_keys, - "name": nlp.vocab.vectors.name, - } - meta.setdefault("name", f"model{i}") - meta.setdefault("version", version) - meta["labels"] = nlp.meta["labels"] - meta_loc = output_path / f"model{i}" / "meta.json" - srsly.write_json(meta_loc, meta) - util.set_env_log(verbose) - - progress = _get_progress( - i, - losses, - scorer.scores, - output_stats, - beam_width=beam_width if has_beam_widths else None, - cpu_wps=cpu_wps, - gpu_wps=gpu_wps, - ) - if i == 0 and "textcat" in pipeline: - textcats_per_cat = scorer.scores.get("textcats_per_cat", {}) - for cat, cat_score in textcats_per_cat.items(): - if cat_score.get("roc_auc_score", 0) < 0: - msg.warn( - f"Textcat ROC AUC score is undefined due to " - f"only one value in label '{cat}'." 
- ) - msg.row(progress, **row_settings) - # Early stopping - if n_early_stopping is not None: - current_score = _score_for_model(meta) - if current_score < best_score: - iter_since_best += 1 - else: - iter_since_best = 0 - best_score = current_score - if iter_since_best >= n_early_stopping: - msg.text( - f"Early stopping, best iteration is: {i - iter_since_best}" - ) - msg.text( - f"Best score = {best_score}; Final iteration score = {current_score}" - ) - break - except Exception as e: - msg.warn(f"Aborting and saving final best model. Encountered exception: {e}", exits=1) - finally: - best_pipes = nlp.pipe_names - if disabled_pipes: - disabled_pipes.restore() - with nlp.use_params(optimizer.averages): - final_model_path = output_path / "model-final" - nlp.to_disk(final_model_path) - meta_loc = output_path / "model-final" / "meta.json" - final_meta = srsly.read_json(meta_loc) - final_meta.setdefault("accuracy", {}) - final_meta["accuracy"].update(meta.get("accuracy", {})) - final_meta.setdefault("speed", {}) - final_meta["speed"].setdefault("cpu", None) - final_meta["speed"].setdefault("gpu", None) - meta.setdefault("speed", {}) - meta["speed"].setdefault("cpu", None) - meta["speed"].setdefault("gpu", None) - # combine cpu and gpu speeds with the base model speeds - if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: - speed = _get_total_speed( - [final_meta["speed"]["cpu"], meta["speed"]["cpu"]] - ) - final_meta["speed"]["cpu"] = speed - if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: - speed = _get_total_speed( - [final_meta["speed"]["gpu"], meta["speed"]["gpu"]] - ) - final_meta["speed"]["gpu"] = speed - # if there were no speeds to update, overwrite with meta - if ( - final_meta["speed"]["cpu"] is None - and final_meta["speed"]["gpu"] is None - ): - final_meta["speed"].update(meta["speed"]) - # note: beam speeds are not combined with the base model - if has_beam_widths: - final_meta.setdefault("beam_accuracy", {}) - final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {})) - final_meta.setdefault("beam_speed", {}) - final_meta["beam_speed"].update(meta.get("beam_speed", {})) - srsly.write_json(meta_loc, final_meta) - msg.good("Saved model to output directory", final_model_path) - with msg.loading("Creating best model..."): - best_model_path = _collate_best_model(final_meta, output_path, best_pipes) - msg.good("Created best model", best_model_path) - - -def _score_for_model(meta): - """ Returns mean score between tasks in pipeline that can be used for early stopping. """ - mean_acc = list() - pipes = meta["pipeline"] - acc = meta["accuracy"] - if "tagger" in pipes: - mean_acc.append(acc["tags_acc"]) - if "morphologizer" in pipes: - mean_acc.append((acc["morphs_acc"] + acc["pos_acc"]) / 2) - if "parser" in pipes: - mean_acc.append((acc["uas"] + acc["las"]) / 2) - if "ner" in pipes: - mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) - if "textcat" in pipes: - mean_acc.append(acc["textcat_score"]) - if "senter" in pipes: - mean_acc.append((acc["sent_p"] + acc["sent_r"] + acc["sent_f"]) / 3) - return sum(mean_acc) / len(mean_acc) - - -@contextlib.contextmanager -def _create_progress_bar(total): - if int(os.environ.get("LOG_FRIENDLY", 0)): - yield - else: - pbar = tqdm.tqdm(total=total, leave=False) - yield pbar - - -def _load_vectors(nlp, vectors): - util.load_model(vectors, vocab=nlp.vocab) - - -def _load_pretrained_tok2vec(nlp, loc): - """Load pretrained weights for the 'token-to-vector' part of the component - models, which is typically a CNN. 
See 'spacy pretrain'. Experimental. - """ - with loc.open("rb") as file_: - weights_data = file_.read() - loaded = [] - for name, component in nlp.pipeline: - if hasattr(component, "model") and component.model.has_ref("tok2vec"): - component.get_ref("tok2vec").from_bytes(weights_data) - loaded.append(name) - return loaded - - -def _collate_best_model(meta, output_path, components): - bests = {} - meta.setdefault("accuracy", {}) - for component in components: - bests[component] = _find_best(output_path, component) - best_dest = output_path / "model-best" - shutil.copytree(str(output_path / "model-final"), str(best_dest)) - for component, best_component_src in bests.items(): - shutil.rmtree(str(best_dest / component)) - shutil.copytree(str(best_component_src / component), str(best_dest / component)) - accs = srsly.read_json(best_component_src / "accuracy.json") - for metric in _get_metrics(component): - meta["accuracy"][metric] = accs[metric] - srsly.write_json(best_dest / "meta.json", meta) - return best_dest - - -def _find_best(experiment_dir, component): - accuracies = [] - for epoch_model in experiment_dir.iterdir(): - if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": - accs = srsly.read_json(epoch_model / "accuracy.json") - scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] - # remove per_type dicts from score list for max() comparison - scores = [score for score in scores if isinstance(score, float)] - accuracies.append((scores, epoch_model)) - if accuracies: - return max(accuracies)[1] - else: - return None - - -def _get_metrics(component): - if component == "parser": - return ("las", "uas", "las_per_type", "sent_f", "token_acc") - elif component == "tagger": - return ("tags_acc", "token_acc") - elif component == "morphologizer": - return ("morphs_acc", "pos_acc", "token_acc") - elif component == "ner": - return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc") - elif component == "senter": - return ("sent_f", "sent_p", "sent_r", "token_acc") - elif component == "textcat": - return ("textcat_score", "token_acc") - return ("token_acc",) - - -def _configure_training_output(pipeline, use_gpu, has_beam_widths): - row_head = ["Itn"] - output_stats = [] - for pipe in pipeline: - if pipe == "tagger": - row_head.extend(["Tag Loss ", " Tag % "]) - output_stats.extend(["tag_loss", "tags_acc"]) - elif pipe == "morphologizer" or pipe == "morphologizertagger": - row_head.extend(["Morph Loss ", " Morph % ", " POS % "]) - output_stats.extend(["morph_loss", "morphs_acc", "pos_acc"]) - elif pipe == "parser": - row_head.extend( - ["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"] - ) - output_stats.extend( - ["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"] - ) - elif pipe == "ner": - row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "]) - output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"]) - elif pipe == "textcat": - row_head.extend(["Textcat Loss", "Textcat"]) - output_stats.extend(["textcat_loss", "textcat_score"]) - elif pipe == "senter": - row_head.extend(["Senter Loss", "Sent P", "Sent R", "Sent F"]) - output_stats.extend(["senter_loss", "sent_p", "sent_r", "sent_f"]) - row_head.extend(["Token %", "CPU WPS"]) - output_stats.extend(["token_acc", "cpu_wps"]) - - if use_gpu >= 0: - row_head.extend(["GPU WPS"]) - output_stats.extend(["gpu_wps"]) - - if has_beam_widths: - row_head.insert(1, "Beam W.") - # remove duplicates - row_head_dict = {k: 1 for k in row_head} - output_stats_dict = {k: 1 for k in output_stats} - 
return row_head_dict.keys(), output_stats_dict.keys() - - -def _get_progress( - itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0 -): - scores = {} - for stat in output_stats: - scores[stat] = 0.0 - scores["dep_loss"] = losses.get("parser", 0.0) - scores["ner_loss"] = losses.get("ner", 0.0) - scores["tag_loss"] = losses.get("tagger", 0.0) - scores["morph_loss"] = losses.get("morphologizer", 0.0) - scores["textcat_loss"] = losses.get("textcat", 0.0) - scores["senter_loss"] = losses.get("senter", 0.0) - scores["cpu_wps"] = cpu_wps - scores["gpu_wps"] = gpu_wps or 0.0 - scores.update(dev_scores) - formatted_scores = [] - for stat in output_stats: - format_spec = "{:.3f}" - if stat.endswith("_wps"): - format_spec = "{:.0f}" - formatted_scores.append(format_spec.format(scores[stat])) - result = [itn + 1] - result.extend(formatted_scores) - if beam_width is not None: - result.insert(1, beam_width) - return result - - -def _get_total_speed(speeds): - seconds_per_word = 0.0 - for words_per_second in speeds: - if words_per_second is None: - return None - seconds_per_word += 1.0 / words_per_second - return 1.0 / seconds_per_word diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index a6d0a0abc..ec099b294 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -1,5 +1,7 @@ from typing import Optional, Dict, List, Union, Sequence from timeit import default_timer as timer + +import srsly from pydantic import BaseModel, FilePath import plac import tqdm @@ -11,9 +13,10 @@ from thinc.api import Model, use_pytorch_for_gpu_memory import random from ..gold import GoldCorpus +from ..lookups import Lookups from .. import util from ..errors import Errors -from ..ml import models # don't remove - required to load the built-in architectures +from ..ml import models # don't remove - required to load the built-in architectures registry = util.registry @@ -23,7 +26,6 @@ patience = 10 eval_frequency = 10 dropout = 0.2 init_tok2vec = null -vectors = null max_epochs = 100 orth_variant_level = 0.0 gold_preproc = false @@ -47,7 +49,7 @@ beta2 = 0.999 [nlp] lang = "en" -vectors = ${training:vectors} +vectors = null [nlp.pipeline.tok2vec] factory = "tok2vec" @@ -93,7 +95,6 @@ class ConfigSchema(BaseModel): eval_frequency: int = 100 dropout: float = 0.2 init_tok2vec: Optional[FilePath] = None - vectors: Optional[str] = None max_epochs: int = 100 orth_variant_level: float = 0.0 gold_preproc: bool = False @@ -119,9 +120,14 @@ class ConfigSchema(BaseModel): dev_path=("Location of JSON-formatted development data", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), output_path=("Output directory to store model in", "option", "o", Path), - meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), + init_tok2vec=( + "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. 
Experimental.", "option", "t2v", + Path), raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), + verbose=("Display more information for debugging purposes", "flag", "VV", bool), use_gpu=("Use GPU", "option", "g", int), + tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), + omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), # fmt: on ) def train_cli( @@ -129,30 +135,53 @@ def train_cli( dev_path, config_path, output_path=None, - meta_path=None, + init_tok2vec=None, raw_text=None, - debug=False, verbose=False, use_gpu=-1, + tag_map_path=None, + omit_extra_lookups=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ + util.set_env_log(verbose) + + # Make sure all files and paths exists if they are needed if not config_path or not config_path.exists(): msg.fail("Config file not found", config_path, exits=1) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) - if meta_path is not None and not meta_path.exists(): - msg.fail("Can't find model meta.json", meta_path, exits=1) if output_path is not None and not output_path.exists(): output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) + if raw_text is not None: + raw_text = list(srsly.read_jsonl(raw_text)) + tag_map = {} + if tag_map_path is not None: + tag_map = srsly.read_json(tag_map_path) + + weights_data = None + if init_tok2vec is not None: + if not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + with init_tok2vec.open("rb") as file_: + weights_data = file_.read() if use_gpu >= 0: - msg.info("Using GPU") + msg.info("Using GPU: {use_gpu}") util.use_gpu(use_gpu) else: msg.info("Using CPU") @@ -161,13 +190,21 @@ def train_cli( config_path, {"train": train_path, "dev": dev_path}, output_path=output_path, - meta_path=meta_path, raw_text=raw_text, + tag_map=tag_map, + weights_data=weights_data, + omit_extra_lookups=omit_extra_lookups, ) def train( - config_path, data_paths, raw_text=None, meta_path=None, output_path=None, + config_path, + data_paths, + raw_text=None, + output_path=None, + tag_map=None, + weights_data=None, + omit_extra_lookups=False, ): msg.info(f"Loading config from: {config_path}") # Read the config first without creating objects, to get to the original nlp_config @@ -177,15 +214,104 @@ def train( use_pytorch_for_gpu_memory() nlp_config = config["nlp"] config = util.load_config(config_path, create_objects=True) + training = config["training"] msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) - training = config["training"] optimizer = training["optimizer"] limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) - msg.info("Initializing the nlp pipeline") - nlp.begin_training(lambda: corpus.train_examples) + + # verify textcat config + if "textcat" in 
nlp_config["pipeline"]: + textcat_labels = set(nlp.get_pipe("textcat").labels) + textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] + + # check whether the setting 'exclusive_classes' corresponds to the provided training data + if textcat_multilabel: + multilabel_found = False + for ex in corpus.train_examples: + cats = ex.doc_annotation.cats + textcat_labels.update(cats.keys()) + if list(cats.values()).count(1.0) != 1: + multilabel_found = True + if not multilabel_found: + msg.warn( + "The textcat training instances look like they have " + "mutually exclusive classes. Set 'exclusive_classes' " + "to 'true' in the config to train a classifier with " + "mutually exclusive classes more accurately." + ) + else: + for ex in corpus.train_examples: + cats = ex.doc_annotation.cats + textcat_labels.update(cats.keys()) + if list(cats.values()).count(1.0) != 1: + msg.fail( + "Some textcat training instances do not have exactly " + "one positive label. Set 'exclusive_classes' " + "to 'false' in the config to train a classifier with classes " + "that are not mutually exclusive." + ) + msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + nlp.get_pipe("textcat").labels = tuple(textcat_labels) + + # if 'positive_label' is provided: double check whether it's in the data and the task is binary + if nlp_config["pipeline"]["textcat"].get("positive_label", None): + textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] + if pos_label not in textcat_labels: + msg.fail( + f"The textcat's 'positive_label' config setting '{pos_label}' " + f"does not match any label in the training data.", + exits=1, + ) + if len(textcat_labels) != 2: + msg.fail( + f"A textcat 'positive_label' '{pos_label}' was " + f"provided for training data that does not appear to be a " + f"binary classification problem with two labels.", + exits=1, + ) + + if training.get("resume", False): + msg.info("Resuming training") + nlp.resume_training() + else: + msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") + nlp.begin_training( + lambda: corpus.train_examples + ) + + # Update tag map with provided mapping + nlp.vocab.morphology.tag_map.update(tag_map) + + # Create empty extra lexeme tables so the data from spacy-lookups-data + # isn't loaded if these features are accessed + if omit_extra_lookups: + nlp.vocab.lookups_extra = Lookups() + nlp.vocab.lookups_extra.add_table("lexeme_cluster") + nlp.vocab.lookups_extra.add_table("lexeme_prob") + nlp.vocab.lookups_extra.add_table("lexeme_settings") + + # Load a pretrained tok2vec model - cf. 
CLI command 'pretrain' + if weights_data is not None: + tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None) + if tok2vec_path is None: + msg.fail( + f"To use a pretrained tok2vec model, the config needs to specify which " + f"tok2vec layer to load in the setting [pretraining.tok2vec_model].", + exits=1, + ) + tok2vec = config + for subpath in tok2vec_path.split("."): + tok2vec = tok2vec.get(subpath) + if not tok2vec: + msg.fail( + f"Could not locate the tok2vec model at {tok2vec_path}.", + exits=1, + ) + tok2vec.from_bytes(weights_data) train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -202,6 +328,7 @@ def train( patience=training.get("patience", 0), max_steps=training.get("max_steps", 0), eval_frequency=training["eval_frequency"], + raw_text=raw_text, ) msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") @@ -215,7 +342,8 @@ def train( progress.close() print_row(info) if is_best_checkpoint and output_path is not None: - nlp.to_disk(output_path) + update_meta(training, nlp, info) + nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) # Clean up the objects to faciliate garbage collection. for eg in batch: @@ -223,6 +351,12 @@ def train( eg.goldparse = None eg.doc_annotation = None eg.token_annotation = None + except Exception as e: + msg.warn( + f"Aborting and saving the final best model. " + f"Encountered exception: {str(e)}", + exits=1, + ) finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -231,24 +365,30 @@ def train( nlp.to_disk(final_model_path) else: nlp.to_disk(final_model_path) - msg.good("Saved model to output directory", final_model_path) + msg.good(f"Saved model to output directory {final_model_path}") def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: - train_examples = list(corpus.train_dataset( - nlp, - noise_level=0.0, - orth_variant_level=cfg["orth_variant_level"], - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ignore_misaligned=True, - )) + train_examples = list( + corpus.train_dataset( + nlp, + noise_level=cfg["noise_level"], + orth_variant_level=cfg["orth_variant_level"], + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"], + ignore_misaligned=True, + ) + ) if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) - batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) + batches = util.minibatch_by_words( + train_examples, + size=cfg["batch_size"], + discard_oversize=cfg["discard_oversize"], + ) # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop try: first = next(batches) @@ -273,7 +413,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): ) n_words = sum(len(ex.doc) for ex in dev_examples) start_time = timer() - + if optimizer.averages: with nlp.use_params(optimizer.averages): scorer = nlp.evaluate(dev_examples, batch_size=32) @@ -284,7 +424,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): scores = scorer.scores # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] - weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + try: + weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + except KeyError as e: + raise 
KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) + scores["speed"] = wps return weighted_score, scores @@ -292,8 +436,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): def train_while_improving( - nlp, optimizer, train_data, evaluate, *, dropout, eval_frequency, - accumulate_gradient=1, patience=0, max_steps=0 + nlp, + optimizer, + train_data, + evaluate, + *, + dropout, + eval_frequency, + accumulate_gradient=1, + patience=0, + max_steps=0, + raw_text=None, ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -341,11 +494,22 @@ def train_while_improving( losses = {} to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")] + if raw_text: + random.shuffle(raw_text) + raw_batches = util.minibatch( + (nlp.make_doc(rt["text"]) for rt in raw_text), size=8 + ) + for step, batch in enumerate(train_data): dropout = next(dropouts) with nlp.select_pipes(enable=to_enable): for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update(subbatch, drop=dropout, losses=losses, sgd=False) + if raw_text: + # If raw text is available, perform 'rehearsal' updates, + # which use unlabelled data to reduce overfitting. + raw_batch = list(next(raw_batches)) + nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) for name, proc in nlp.pipeline: if hasattr(proc, "model"): proc.model.finish_update(optimizer) @@ -386,7 +550,7 @@ def subdivide_batch(batch, accumulate_gradient): if subbatch: yield subbatch start += len(subbatch) - subbatch = batch[start : ] + subbatch = batch[start:] if subbatch: yield subbatch @@ -405,14 +569,34 @@ def setup_printer(training, nlp): msg.row(["-" * width for width in table_widths]) def print_row(info): - losses = [ - "{0:.2f}".format(float(info["losses"].get(pipe_name, 0.0))) - for pipe_name in nlp.pipe_names - ] - scores = [ - "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) for col in score_cols - ] - data = [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + try: + losses = [ + "{0:.2f}".format(float(info["losses"][pipe_name])) + for pipe_name in nlp.pipe_names + ] + except KeyError as e: + raise KeyError( + Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) + + try: + scores = [ + "{0:.2f}".format(float(info["other_scores"][col])) + for col in score_cols + ] + except KeyError as e: + raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) + data = ( + [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] + ) msg.row(data, widths=table_widths, aligns=table_aligns) return print_row + + +def update_meta(training, nlp, info): + score_cols = training["scores"] + nlp.meta["performance"] = {} + for metric in score_cols: + nlp.meta["performance"][metric] = info["other_scores"][metric] + for pipe_name in nlp.pipe_names: + nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/errors.py b/spacy/errors.py index 94a0218a7..d6fdd1b43 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -580,7 +580,14 @@ class Errors(object): "table, which contains {n_rows} vectors.") # TODO: fix numbering after merging develop into master - + E983 = ("Invalid key for '{dict_name}': {key}. 
Available keys: " + "{keys}") + E984 = ("Could not parse the {input} - double check the data is written " + "in the correct format as expected by spaCy.") + E985 = ("The pipeline component '{component}' is already available in the base " + "model. The settings in the component block in the config file are " + "being ignored. If you want to replace this component instead, set " + "'replace' to True in the training configuration.") E986 = ("Could not create any training batches: check your input. " "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1e58f0635..19b135193 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -229,6 +229,10 @@ class GoldCorpus(object): if not (doc is None or isinstance(doc, Doc) or isinstance(doc, str)): raise ValueError(Errors.E987.format(type=type(doc))) examples.append(Example.from_dict(ex_dict, doc=doc)) + else: + raise ValueError(Errors.E984.format(input="JSONL format")) + else: + raise ValueError(Errors.E984.format(input="JSONL format")) elif file_name.endswith("msg"): text, ex_dict = srsly.read_msgpack(loc) @@ -550,14 +554,22 @@ def json_to_examples(doc): def read_json_file(loc, docs_filter=None, limit=None): loc = util.ensure_path(loc) if loc.is_dir(): + parsed = False for filename in loc.iterdir(): + parsed = True yield from read_json_file(loc / filename, limit=limit) + if not parsed: + raise ValueError(Errors.E984.format(input="JSON directory")) else: + parsed = False for doc in _json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue for json_data in json_to_examples(doc): + parsed = True yield json_data + if not parsed: + raise ValueError(Errors.E984.format(input="JSON file")) def _json_iterate(loc): diff --git a/spacy/language.py b/spacy/language.py index 6341dc858..97bdd698c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -319,14 +319,14 @@ class Language(object): # transform the model's config to an actual Model factory_cfg = dict(config) - # check whether we have a proper model config, or load a default one + # check whether we have a proper model config, ignore if the type is wrong if "model" in factory_cfg and not isinstance(factory_cfg["model"], dict): warnings.warn( Warnings.W099.format(type=type(factory_cfg["model"]), pipe=name) ) # refer to the model configuration in the cfg settings for this component - if "model" in factory_cfg: + elif "model" in factory_cfg: self.config[name] = {"model": factory_cfg["model"]} # create all objects in the config @@ -1086,6 +1086,7 @@ class component(object): requires=tuple(), retokenizes=False, default_model=lambda: None, + default_config=None, ): """Decorate a pipeline component. 
@@ -1099,6 +1100,7 @@ class component(object): self.requires = validate_attrs(requires) self.retokenizes = retokenizes self.default_model = default_model + self.default_config = default_config def __call__(self, *args, **kwargs): obj = args[0] @@ -1113,9 +1115,10 @@ class component(object): def factory(nlp, model, **cfg): if model is None: model = self.default_model() - warnings.warn(Warnings.W098.format(name=self.name)) - if model is None: - warnings.warn(Warnings.W097.format(name=self.name)) + if self.default_config: + for key, value in self.default_config.items(): + if key not in cfg: + cfg[key] = value if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, model, **cfg) elif isinstance(obj, type): diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 8000d1aff..4a360a9e6 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -3,26 +3,31 @@ import numpy from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model -def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): +def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): + softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, - Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=3, dropout=0.0), + Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), LayerNorm(token_vector_width * 2), - Softmax(nO=n_tags, nI=token_vector_width * 2), + softmax, ) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", softmax) return model -def build_cloze_multi_task_model(vocab, tok2vec): - output_size = vocab.vectors.data.shape[1] +def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): + # nO = vocab.vectors.data.shape[1] output_layer = chain( Maxout( - nO=output_size, nI=tok2vec.get_dim("nO"), nP=3, normalize=True, dropout=0.0 + nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 ), - Linear(nO=output_size, nI=output_size, init_W=zero_init), + Linear(nO=nO, nI=nO, init_W=zero_init), ) model = chain(tok2vec, output_layer) model = build_masked_language_model(vocab, model) + model.set_ref("tok2vec", tok2vec) + model.set_ref("output_layer", output_layer) return model diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 141c66f79..a02e1a5a1 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -31,6 +31,7 @@ def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None): model.set_ref("output_layer", linear_layer) model.set_ref("tok2vec", tok2vec) model.set_dim("nO", nO) + model.attrs["multi_label"] = not exclusive_classes return model @@ -44,6 +45,7 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO output_layer = softmax_activation() if exclusive_classes else Logistic() model = model >> with_cpu(output_layer, output_layer.ops) model.set_ref("output_layer", sparse_linear) + model.attrs["multi_label"] = not exclusive_classes return model @@ -110,6 +112,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class if model.has_dim("nO") is not False: model.set_dim("nO", nO) model.set_ref("output_layer", linear_model.get_ref("output_layer")) + model.attrs["multi_label"] = not exclusive_classes return model diff --git a/spacy/pipeline/defaults/multitask_defaults.cfg b/spacy/pipeline/defaults/multitask_defaults.cfg new file mode 100644 index 000000000..d3dbe9b53 --- /dev/null +++ 
b/spacy/pipeline/defaults/multitask_defaults.cfg @@ -0,0 +1,15 @@ +[model] +@architectures = "spacy.MultiTask.v1" +maxout_pieces = 3 +token_vector_width = 96 + +[model.tok2vec] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 96 +depth = 4 +embed_size = 2000 +window_size = 1 +maxout_pieces = 2 +subword_features = true +dropout = null diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a6edf00d9..75628ce3c 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -648,9 +648,10 @@ class MultitaskObjective(Tagger): side-objective. """ - def __init__(self, vocab, model, target='dep_tag_offset', **cfg): + def __init__(self, vocab, model, **cfg): self.vocab = vocab self.model = model + target = cfg["target"] # default: 'dep_tag_offset' if target == "dep": self.make_label = self.make_dep elif target == "tag": @@ -668,8 +669,6 @@ class MultitaskObjective(Tagger): else: raise ValueError(Errors.E016) self.cfg = dict(cfg) - # TODO: remove - put in config - self.cfg.setdefault("maxout_pieces", 2) @property def labels(self): @@ -682,7 +681,7 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None, + def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: @@ -808,13 +807,13 @@ class ClozeMultitask(Pipe): self.vocab = vocab self.model = model self.cfg = cfg - self.distance = CosineDistance(ignore_zeros=True, normalize=False) + self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config def set_annotations(self, docs, dep_ids, tensors=None): pass def begin_training(self, get_examples=lambda: [], pipeline=None, - tok2vec=None, sgd=None, **kwargs): + sgd=None, **kwargs): link_vectors_to_models(self.vocab) self.model.initialize() X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) @@ -951,13 +950,13 @@ class TextCategorizer(Pipe): losses[self.name] += (gradient**2).sum() def _examples_to_truth(self, examples): - golds = [ex.gold for ex in examples] - truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") - not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") - for i, gold in enumerate(golds): + gold_cats = [ex.doc_annotation.cats for ex in examples] + truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f") + not_missing = numpy.ones((len(gold_cats), len(self.labels)), dtype="f") + for i, gold_cat in enumerate(gold_cats): for j, label in enumerate(self.labels): - if label in gold.cats: - truths[i, j] = gold.cats[label] + if label in gold_cat: + truths[i, j] = gold_cat[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -1026,28 +1025,27 @@ cdef class DependencyParser(Parser): output.append(merge_subtokens) return tuple(output) - def add_multitask_objective(self, target): - if target == "cloze": - cloze = ClozeMultitask(self.vocab) - self._multitasks.append(cloze) - else: - labeller = MultitaskObjective(self.vocab, target=target) - self._multitasks.append(labeller) + def add_multitask_objective(self, mt_component): + self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? 
@@ -1026,28 +1025,27 @@ cdef class DependencyParser(Parser):
             output.append(merge_subtokens)
         return tuple(output)

-    def add_multitask_objective(self, target):
-        if target == "cloze":
-            cloze = ClozeMultitask(self.vocab)
-            self._multitasks.append(cloze)
-        else:
-            labeller = MultitaskObjective(self.vocab, target=target)
-            self._multitasks.append(labeller)
+    def add_multitask_objective(self, mt_component):
+        self._multitasks.append(mt_component)

     def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
-            tok2vec = self.model.get_ref("tok2vec")
-            labeller.begin_training(get_examples, pipeline=pipeline,
-                                    tok2vec=tok2vec, sgd=sgd)
+            labeller.model.set_dim("nO", len(self.labels))
+            if labeller.model.has_ref("output_layer"):
+                labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
+            labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)

     def __reduce__(self):
-        return (DependencyParser, (self.vocab, self.model), self.moves)
+        return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg))

     def __getstate__(self):
-        return self.moves
+        return (self.moves, self.cfg)

-    def __setstate__(self, moves):
+    def __setstate__(self, state):
+        moves, config = state
         self.moves = moves
+        self.cfg = config

     @property
     def labels(self):
@@ -1073,28 +1071,27 @@ cdef class EntityRecognizer(Parser):
     requires = []
     TransitionSystem = BiluoPushDown

-    def add_multitask_objective(self, target):
-        if target == "cloze":
-            cloze = ClozeMultitask(self.vocab)
-            self._multitasks.append(cloze)
-        else:
-            labeller = MultitaskObjective(self.vocab, target=target)
-            self._multitasks.append(labeller)
+    def add_multitask_objective(self, mt_component):
+        self._multitasks.append(mt_component)

     def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
-            tok2vec = self.model.get_ref("tok2vec")
-            labeller.begin_training(get_examples, pipeline=pipeline,
-                                    tok2vec=tok2vec)
+            labeller.model.set_dim("nO", len(self.labels))
+            if labeller.model.has_ref("output_layer"):
+                labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
+            labeller.begin_training(get_examples, pipeline=pipeline)

     def __reduce__(self):
-        return (EntityRecognizer, (self.vocab, self.model), self.moves)
+        return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg))

     def __getstate__(self):
-        return self.moves
+        return self.moves, self.cfg

-    def __setstate__(self, moves):
+    def __setstate__(self, state):
+        moves, config = state
         self.moves = moves
+        self.cfg = config

     @property
     def labels(self):
@@ -1565,15 +1562,23 @@ Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, mod
 Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg)


 def parser_factory(nlp, model, **cfg):
+    default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
     if model is None:
         model = default_parser()
         warnings.warn(Warnings.W098.format(name="parser"))
+    for key, value in default_config.items():
+        if key not in cfg:
+            cfg[key] = value
     return DependencyParser.from_nlp(nlp, model, **cfg)


 def ner_factory(nlp, model, **cfg):
+    default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
     if model is None:
         model = default_ner()
         warnings.warn(Warnings.W098.format(name="ner"))
+    for key, value in default_config.items():
+        if key not in cfg:
+            cfg[key] = value
     return EntityRecognizer.from_nlp(nlp, model, **cfg)


 __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
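Both factories (and the `component` decorator hunk at the top of this section) use the same merge rule: explicit `cfg` keys win, and the defaults only fill in what the caller left out. A self-contained sketch of that rule, with illustrative values:

```python
# Plain-Python illustration of the merge order used by parser_factory/ner_factory
# above; the dicts here are examples, not the library's actual call site.
default_config = {"learn_tokens": False, "min_action_freq": 30,
                  "beam_width": 1, "beam_update_prob": 1.0}
cfg = {"min_action_freq": 1}  # caller override
for key, value in default_config.items():
    if key not in cfg:
        cfg[key] = value
assert cfg["min_action_freq"] == 1   # explicit value kept
assert cfg["learn_tokens"] is False  # default filled in
```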
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 5882fa266..de30a55f0 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -172,7 +172,7 @@ class Tok2VecListener(Model):
     def verify_inputs(self, inputs):
         if self._batch_id is None and self._outputs is None:
-            raise ValueError
+            raise ValueError("The Tok2Vec listener did not receive valid input.")
         else:
             batch_id = self.get_batch_id(inputs)
             if batch_id != self._batch_id:
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 7e2466be7..288da23aa 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -88,24 +88,20 @@ class Scorer(object):
         self.ner = PRFScore()
         self.ner_per_ents = dict()
         self.eval_punct = eval_punct
-        self.textcat = None
-        self.textcat_per_cat = dict()
+        self.textcat = PRFScore()
+        self.textcat_f_per_cat = dict()
+        self.textcat_auc_per_cat = dict()
         self.textcat_positive_label = None
         self.textcat_multilabel = False
         if pipeline:
-            for name, model in pipeline:
+            for name, component in pipeline:
                 if name == "textcat":
-                    self.textcat_positive_label = model.cfg.get("positive_label", None)
-                    if self.textcat_positive_label:
-                        self.textcat = PRFScore()
-                    if not model.cfg.get("exclusive_classes", False):
-                        self.textcat_multilabel = True
-                        for label in model.cfg.get("labels", []):
-                            self.textcat_per_cat[label] = ROCAUCScore()
-                    else:
-                        for label in model.cfg.get("labels", []):
-                            self.textcat_per_cat[label] = PRFScore()
+                    self.textcat_multilabel = component.model.attrs["multi_label"]
+                    self.textcat_positive_label = component.cfg.get("positive_label", None)
+                    for label in component.cfg.get("labels", []):
+                        self.textcat_auc_per_cat[label] = ROCAUCScore()
+                        self.textcat_f_per_cat[label] = PRFScore()

     @property
     def tags_acc(self):
@@ -207,46 +203,52 @@ class Scorer(object):
         }

     @property
-    def textcat_score(self):
-        """RETURNS (float): f-score on positive label for binary exclusive,
-            macro-averaged f-score for 3+ exclusive,
-            macro-averaged AUC ROC score for multilabel (-1 if undefined)
+    def textcat_f(self):
+        """RETURNS (float): f-score on positive label for binary classification,
+            macro-averaged f-score for multilabel classification
         """
         if not self.textcat_multilabel:
-            # binary multiclass
             if self.textcat_positive_label:
+                # binary classification
                 return self.textcat.fscore * 100
-            # other multiclass
-            return (
-                sum([score.fscore for label, score in self.textcat_per_cat.items()])
-                / (len(self.textcat_per_cat) + 1e-100)
-                * 100
-            )
-        # multilabel
+        # multi-class and/or multi-label
+        return (
+            sum([score.fscore for label, score in self.textcat_f_per_cat.items()])
+            / (len(self.textcat_f_per_cat) + 1e-100)
+            * 100
+        )
+
+    @property
+    def textcat_auc(self):
+        """RETURNS (float): macro-averaged AUC ROC score for multilabel classification (-1 if undefined)
+        """
         return max(
-            sum([score.score for label, score in self.textcat_per_cat.items()])
-            / (len(self.textcat_per_cat) + 1e-100),
+            sum([score.score for label, score in self.textcat_auc_per_cat.items()])
+            / (len(self.textcat_auc_per_cat) + 1e-100),
             -1,
         )

     @property
-    def textcats_per_cat(self):
-        """RETURNS (dict): Scores per textcat label.
+    def textcats_auc_per_cat(self):
+        """RETURNS (dict): AUC ROC Scores per textcat label.
         """
-        if not self.textcat_multilabel:
-            return {
-                k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
-                for k, v in self.textcat_per_cat.items()
-            }
         return {
             k: {"roc_auc_score": max(v.score, -1)}
-            for k, v in self.textcat_per_cat.items()
+            for k, v in self.textcat_auc_per_cat.items()
+        }
+
+    @property
+    def textcats_f_per_cat(self):
+        """RETURNS (dict): F-scores per textcat label.
+        """
+        return {
+            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+            for k, v in self.textcat_f_per_cat.items()
         }

     @property
     def scores(self):
-        """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`,
-            `ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`.
+        """RETURNS (dict): All scores mapped by key.
         """
         return {
             "uas": self.uas,
@@ -264,8 +266,10 @@ class Scorer(object):
             "sent_r": self.sent_r,
             "sent_f": self.sent_f,
             "token_acc": self.token_acc,
-            "textcat_score": self.textcat_score,
-            "textcats_per_cat": self.textcats_per_cat,
+            "textcat_f": self.textcat_f,
+            "textcat_auc": self.textcat_auc,
+            "textcats_f_per_cat": self.textcats_f_per_cat,
+            "textcats_auc_per_cat": self.textcats_auc_per_cat,
         }

     def score(self, example, verbose=False, punct_labels=("p", "punct")):
@@ -408,7 +412,7 @@ class Scorer(object):
             )
         if (
             len(gold.cats) > 0
-            and set(self.textcat_per_cat) == set(gold.cats)
+            and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats)
             and set(gold.cats) == set(doc.cats)
         ):
             goldcat = max(gold.cats, key=gold.cats.get)
@@ -418,17 +422,21 @@
                     set([self.textcat_positive_label]) & set([candcat]),
                     set([self.textcat_positive_label]) & set([goldcat]),
                 )
-            for label in self.textcat_per_cat:
-                if self.textcat_multilabel:
-                    self.textcat_per_cat[label].score_set(
+            for label in set(gold.cats):
+                self.textcat_auc_per_cat[label].score_set(
                     doc.cats[label], gold.cats[label]
-                    )
-                else:
-                    self.textcat_per_cat[label].score_set(
+                )
+                self.textcat_f_per_cat[label].score_set(
                     set([label]) & set([candcat]), set([label]) & set([goldcat])
-                    )
-        elif len(self.textcat_per_cat) > 0:
-            model_labels = set(self.textcat_per_cat)
+                )
+        elif len(self.textcat_f_per_cat) > 0:
+            model_labels = set(self.textcat_f_per_cat)
+            eval_labels = set(gold.cats)
+            raise ValueError(
+                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
+            )
+        elif len(self.textcat_auc_per_cat) > 0:
+            model_labels = set(self.textcat_auc_per_cat)
             eval_labels = set(gold.cats)
             raise ValueError(
                 Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index fcaff444e..7bd9562e2 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -63,15 +63,14 @@ cdef class Parser:
         # defined by EntityRecognizer as a BiluoPushDown
         moves = self.TransitionSystem(self.vocab.strings)
         self.moves = moves
-        cfg.setdefault('min_action_freq', 30)
-        cfg.setdefault('learn_tokens', False)
-        cfg.setdefault('beam_width', 1)
-        cfg.setdefault('beam_update_prob', 1.0)  # or 0.5 (both defaults were previously used)
         self.model = model
         if self.moves.n_moves != 0:
             self.set_output(self.moves.n_moves)
         self.cfg = cfg
         self._multitasks = []
+        for multitask in cfg.get("multitasks", []):
+            self.add_multitask_objective(multitask)
+
         self._rehearsal_model = None

     @classmethod
@@ -79,13 +78,15 @@ cdef class Parser:
         return cls(nlp.vocab, model, **cfg)

     def __reduce__(self):
-        return (Parser, (self.vocab, self.model), self.moves)
+        return (Parser, (self.vocab, self.model), (self.moves, self.cfg))

     def __getstate__(self):
-        return self.moves
+        return (self.moves, self.cfg)

-    def __setstate__(self, moves):
+    def __setstate__(self, state):
+        moves, config = state
         self.moves = moves
+        self.cfg = config

     @property
     def move_names(self):
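Because `__reduce__`/`__getstate__` now return `(moves, cfg)`, the component config survives pickling. A sketch of the round trip for the parser, mirroring the `test_pickle_ner` test added further down (the `342` value is illustrative only):

```python
# Hedged sketch: relies on create_pipe("parser") falling back to the default
# model (it emits warning W098) and on the new (moves, cfg) pickle state.
import pickle
from spacy.lang.en import English

nlp = English()
parser = nlp.create_pipe("parser", config={"min_action_freq": 342})
parser2 = pickle.loads(pickle.dumps(parser))
assert parser2.cfg["min_action_freq"] == 342  # cfg now survives the round trip
```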
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index c92fc1ff9..879334056 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -9,7 +9,8 @@ from spacy.pipeline.defaults import default_ner
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
     assert len(list(doc.ents)) == 0
@@ -25,7 +26,8 @@
 def test_ents_reset(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(en_vocab, default_ner(), **config)
     ner.begin_training([])
     ner(doc)
     assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index ee1bba886..f9663ba32 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -17,7 +17,8 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    parser = DependencyParser(vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(vocab, default_parser(), **config)
     return parser


@@ -57,12 +58,13 @@ def test_add_label(parser):

 def test_add_label_deserializes_correctly():
-    ner1 = EntityRecognizer(Vocab(), default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner1 = EntityRecognizer(Vocab(), default_ner(), **config)
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
     ner1.begin_training([])
-    ner2 = EntityRecognizer(Vocab(), default_ner())
+    ner2 = EntityRecognizer(Vocab(), default_ner(), **config)
     # the second model needs to be resized before we can call from_bytes
     ner2.model.attrs["resize_output"](ner2.model, ner1.moves.n_moves)
diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 30b4a6f6d..5d265261f 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -138,7 +138,8 @@ def test_get_oracle_actions():
         deps.append(dep)
         ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
-    parser = DependencyParser(doc.vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(doc.vocab, default_parser(), **config)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
     parser.moves.add_action(1, "")
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 8e41a16c0..b0a8109dc 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -138,7 +138,8 @@ def test_accept_blocked_token():
     # 1. test normal behaviour
     nlp1 = English()
    doc1 = nlp1("I live in New York")
-    ner1 = EntityRecognizer(doc1.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config)
     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -156,7 +157,8 @@
     # 2. test blocking behaviour
     nlp2 = English()
     doc2 = nlp2("I live in New York")
-    ner2 = EntityRecognizer(doc2.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config)

     # set "New York" to a blocked entity
     doc2.ents = [(0, 3, 5)]
@@ -213,7 +215,8 @@ def test_overwrite_token():
     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]

     # Check that a new ner can overwrite O
-    ner2 = EntityRecognizer(doc.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner2 = EntityRecognizer(doc.vocab, default_ner(), **config)
     ner2.moves.add_action(5, "")
     ner2.add_label("GPE")
     state = ner2.moves.init_batch([doc])[0]
diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py
index b648e9a00..7f3e981ea 100644
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@@ -28,7 +28,8 @@ def tok2vec():

 @pytest.fixture
 def parser(vocab, arc_eager):
-    return Parser(vocab, model=default_parser(), moves=arc_eager)
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    return Parser(vocab, model=default_parser(), moves=arc_eager, **config)


 @pytest.fixture
diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index db9eb5e6f..fa5d59f9e 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -94,7 +94,8 @@ def test_beam_advance_too_few_scores(beam, scores):

 def test_beam_parse():
     nlp = Language()
-    nlp.add_pipe(DependencyParser(nlp.vocab, default_parser()), name="parser")
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser")
     nlp.parser.add_label("nsubj")
     nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
     doc = nlp.make_doc("Australia is a country")
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index dc13fcdf1..ccf7d3ba3 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -16,7 +16,8 @@ def vocab():

 @pytest.fixture
 def parser(vocab):
-    parser = DependencyParser(vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(vocab, default_parser(), **config)
     parser.cfg["token_vector_width"] = 4
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 5a76697bc..177b6bb3d 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -270,7 +270,8 @@ def test_issue1963(en_tokenizer):

 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
-    ner = EntityRecognizer(Vocab(), default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(Vocab(), default_ner(), **config)
     example = Example(doc=None)
     example.set_token_annotation(
         ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 9ff118a1f..6df437b3c 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -196,7 +196,8 @@ def test_issue3345():
     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
     doc[4].is_sent_start = True
     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-    ner = EntityRecognizer(doc.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(doc.vocab, default_ner(), **config)
     # Add the OUT action. I wouldn't have thought this would be necessary...
     ner.moves.add_action(5, "")
     ner.add_label("GPE")
diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py
index 3d8e80847..15632bdf8 100644
--- a/spacy/tests/regression/test_issue3830.py
+++ b/spacy/tests/regression/test_issue3830.py
@@ -6,7 +6,8 @@ from spacy.pipeline.defaults import default_parser

 def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
-    parser = DependencyParser(Vocab(), default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
     parser.begin_training(lambda: [])
@@ -15,7 +16,8 @@

 def test_issue3830_with_subtok():
     """Test that the parser does have subtok label if learn_tokens=True."""
-    parser = DependencyParser(Vocab(), default_parser(), learn_tokens=True)
+    config = {"learn_tokens": True, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(Vocab(), default_parser(), **config)
     parser.add_label("nsubj")
     assert "subtok" not in parser.labels
     parser.begin_training(lambda: [])
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
index 30081543b..4978aba44 100644
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -74,6 +74,7 @@ def test_issue4042_bug2():
         output_dir.mkdir()
         ner1.to_disk(output_dir)

-        ner2 = EntityRecognizer(vocab, default_ner())
+        config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+        ner2 = EntityRecognizer(vocab, default_ner(), **config)
         ner2.from_disk(output_dir)
         assert len(ner2.labels) == 2
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
index ba4d2deab..946316d85 100644
--- a/spacy/tests/regression/test_issue4313.py
+++ b/spacy/tests/regression/test_issue4313.py
@@ -12,7 +12,8 @@ def test_issue4313():
     beam_width = 16
     beam_density = 0.0001
     nlp = English()
-    ner = EntityRecognizer(nlp.vocab, default_ner())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
     ner.add_label("SOME_LABEL")
     ner.begin_training([])
     nlp.add_pipe(ner)
diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py
index 967db5d67..cdc3c09ca 100644
--- a/spacy/tests/regression/test_issue4725.py
+++ b/spacy/tests/regression/test_issue4725.py
@@ -1,12 +1,30 @@
-import pytest
+import pickle

 import numpy

 from spacy.lang.en import English
 from spacy.vocab import Vocab
+from spacy.tests.util import make_tempdir
+
+
+def test_pickle_ner():
+    """ Ensure the pickling of the NER goes well"""
+    vocab = Vocab(vectors_name="test_vocab_add_vector")
+    nlp = English(vocab=vocab)
+    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
+    with make_tempdir() as tmp_path:
+        with (tmp_path / "ner.pkl").open("wb") as file_:
+            pickle.dump(ner, file_)
+            assert ner.cfg["min_action_freq"] == 342
+
+        with (tmp_path / "ner.pkl").open("rb") as file_:
+            ner2 = pickle.load(file_)
+            assert ner2.cfg["min_action_freq"] == 342
+

 def test_issue4725():
     # ensures that this runs correctly and doesn't hang or crash because of the global vectors
+    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
     vocab = Vocab(vectors_name="test_vocab_add_vector")
     data = numpy.ndarray((5, 3), dtype="f")
     data[0] = 1.0
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 595a35a9f..9c4e1f61e 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -12,7 +12,8 @@ test_parsers = [DependencyParser, EntityRecognizer]

 @pytest.fixture
 def parser(en_vocab):
-    parser = DependencyParser(en_vocab, default_parser())
+    config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
+    parser = DependencyParser(en_vocab, default_parser(), **config)
     parser.add_label("nsubj")
     return parser
diff --git a/spacy/util.py b/spacy/util.py
index bc6c98a82..d2d87bef9 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -186,7 +186,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     return nlp.from_disk(model_path, exclude=disable)


-def load_model_from_config(nlp_config):
+def load_model_from_config(nlp_config, replace=False):
     if "name" in nlp_config:
         nlp = load_model(**nlp_config)
     elif "lang" in nlp_config:
@@ -197,8 +197,15 @@ def load_model_from_config(nlp_config):
     if "pipeline" in nlp_config:
         for name, component_cfg in nlp_config["pipeline"].items():
             factory = component_cfg.pop("factory")
-            component = nlp.create_pipe(factory, config=component_cfg)
-            nlp.add_pipe(component, name=name)
+            if name in nlp.pipe_names:
+                if replace:
+                    component = nlp.create_pipe(factory, config=component_cfg)
+                    nlp.replace_pipe(name, component)
+                else:
+                    raise ValueError(Errors.E985.format(component=name))
+            else:
+                component = nlp.create_pipe(factory, config=component_cfg)
+                nlp.add_pipe(component, name=name)
     return nlp
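The new `replace` flag only matters when the config names a pipe that the loaded base model already contains. A hedged sketch of the two call forms (the `nlp_config` dict is assumed to exist; it is not constructed here):

```python
# Sketch only: nlp_config is assumed to be a parsed config dict whose
# [nlp.pipeline] section names a component the base model already provides.
from spacy.util import load_model_from_config

nlp = load_model_from_config(nlp_config)                 # duplicate name -> ValueError (E985)
nlp = load_model_from_config(nlp_config, replace=True)   # duplicate name -> replace_pipe
```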
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index b1824573c..180665929 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -46,17 +46,19 @@ Update the evaluation scores from a single [`Doc`](/api/doc) /

 ## Properties

-| Name | Type | Description |
-| ----------------------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `token_acc` | float | Tokenization accuracy. |
-| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
-| `uas` | float | Unlabelled dependency score. |
-| `las` | float | Labelled dependency score. |
-| `ents_p` | float | Named entity accuracy (precision). |
-| `ents_r` | float | Named entity accuracy (recall). |
-| `ents_f` | float | Named entity accuracy (F-score). |
-| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
-| `textcat_score` 2.2 | float | F-score on positive label for binary exclusive, macro-averaged F-score for 3+ exclusive, macro-averaged AUC ROC score for multilabel (`-1` if undefined). |
-| `textcats_per_cat` 2.2 | dict | Scores per textcat label, keyed by label. |
-| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. |
-| `scores` | dict | All scores, keyed by type. |
+| Name | Type | Description |
+| --------------------------------------------------- | ----- | ------------------------------------------------------------------------------------- |
+| `token_acc` | float | Tokenization accuracy. |
+| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). |
+| `uas` | float | Unlabelled dependency score. |
+| `las` | float | Labelled dependency score. |
+| `ents_p` | float | Named entity accuracy (precision). |
+| `ents_r` | float | Named entity accuracy (recall). |
+| `ents_f` | float | Named entity accuracy (F-score). |
+| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. |
+| `textcat_f` 3.0 | float | F-score on positive label for binary classification, macro-averaged F-score otherwise. |
+| `textcat_auc` | float | Macro-averaged AUC ROC score for multilabel classification (`-1` if undefined). |
+| `textcats_f_per_cat` 3.0 | dict | F-scores per textcat label, keyed by label. |
+| `textcats_auc_per_cat` 3.0 | dict | ROC AUC scores per textcat label, keyed by label. |
+| `las_per_type` 2.2.3 | dict | Labelled dependency scores, keyed by label. |
+| `scores` | dict | All scores, keyed by type. |
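For orientation, the renamed keys can be checked on a bare `Scorer` without running an evaluation:

```python
# Minimal sanity check of the renamed score keys (no pipeline required).
from spacy.scorer import Scorer

scores = Scorer().scores
assert {"textcat_f", "textcat_auc", "textcats_f_per_cat", "textcats_auc_per_cat"} <= set(scores)
assert "textcat_score" not in scores and "textcats_per_cat" not in scores
```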