From cc1516ec262a12dd3aa85a7c81788af51aae77ff Mon Sep 17 00:00:00 2001
From: Krzysztof Kowalczyk
Date: Mon, 15 Apr 2019 12:04:36 +0200
Subject: [PATCH] Improved training and evaluation (#3538)

* Add early stopping
* Add return_scores option to evaluate
* Fix missing str to path conversion
* Fix import + old Python compatibility
* Fix bad beam_width setting during CPU evaluation in spacy train with GPU option turned on
---
 spacy/cli/evaluate.py |  4 ++++
 spacy/cli/train.py    | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 68a7eca2c..df391d730 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -17,6 +17,7 @@ from .. import displacy
     gpu_id=("Use GPU", "option", "g", int),
     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
+    return_scores=("Return dict containing model scores", "flag", "r", bool),
 )
 def evaluate(
     model,
@@ -25,6 +26,7 @@
     gold_preproc=False,
     displacy_path=None,
     displacy_limit=25,
+    return_scores=False,
 ):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an
@@ -75,6 +77,8 @@ def evaluate(
             ents=render_ents,
         )
         msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
+    if return_scores:
+        return scorer.scores


 def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 743fec9ea..5cf0f5f6f 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -35,6 +35,7 @@ from .. import about
     pipeline=("Comma-separated names of pipeline components", "option", "p", str),
     vectors=("Model to load vectors from", "option", "v", str),
     n_iter=("Number of iterations", "option", "n", int),
+    early_stopping_iter=("Maximum number of training epochs without dev accuracy improvement", "option", "e", int),
     n_examples=("Number of examples", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     version=("Model version", "option", "V", str),
@@ -74,6 +75,7 @@ def train(
     pipeline="tagger,parser,ner",
     vectors=None,
     n_iter=30,
+    early_stopping_iter=None,
     n_examples=0,
     use_gpu=-1,
     version="0.0.0",
@@ -101,6 +103,7 @@ def train(
     train_path = util.ensure_path(train_path)
     dev_path = util.ensure_path(dev_path)
     meta_path = util.ensure_path(meta_path)
+    output_path = util.ensure_path(output_path)
     if raw_text is not None:
         raw_text = list(srsly.read_jsonl(raw_text))
     if not train_path or not train_path.exists():
@@ -222,6 +225,8 @@
     msg.row(row_head, **row_settings)
     msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
     try:
+        iter_since_best = 0
+        best_score = 0.
         for i in range(n_iter):
             train_docs = corpus.train_docs(
                 nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
@@ -276,7 +281,9 @@
                     gpu_wps = nwords / (end_time - start_time)
                     with Model.use_device("cpu"):
                         nlp_loaded = util.load_model_from_path(epoch_model_path)
-                        nlp_loaded.parser.cfg["beam_width"]
+                        for name, component in nlp_loaded.pipeline:
+                            if hasattr(component, "cfg"):
+                                component.cfg["beam_width"] = beam_width
                         dev_docs = list(
                             corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                         )
@@ -328,6 +335,18 @@
                 gpu_wps=gpu_wps,
             )
             msg.row(progress, **row_settings)
+            # early stopping
+            if early_stopping_iter is not None:
+                current_score = _score_for_model(meta)
+                if current_score < best_score:
+                    iter_since_best += 1
+                else:
+                    iter_since_best = 0
+                    best_score = current_score
+                if iter_since_best >= early_stopping_iter:
+                    msg.text("Early stopping, best iteration is: {}".format(i - iter_since_best))
+                    msg.text("Best score = {}; Final iteration score = {}".format(best_score, current_score))
+                    break
     finally:
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / "model-final"
@@ -337,6 +356,18 @@
         best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
         msg.good("Created best model", best_model_path)

+def _score_for_model(meta):
+    """ Returns mean score between tasks in pipeline that can be used for early stopping. """
+    mean_acc = list()
+    pipes = meta['pipeline']
+    acc = meta['accuracy']
+    if 'tagger' in pipes:
+        mean_acc.append(acc['tags_acc'])
+    if 'parser' in pipes:
+        mean_acc.append((acc['uas'] + acc['las']) / 2)
+    if 'ner' in pipes:
+        mean_acc.append((acc['ents_p'] + acc['ents_r'] + acc['ents_f']) / 3)
+    return sum(mean_acc) / len(mean_acc)

 @contextlib.contextmanager
 def _create_progress_bar(total):
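
Note (not part of the commit): a minimal sketch of how the two new options added above might be exercised through the Python API rather than the command line. The language code, file paths, and the "lang" keyword name are illustrative assumptions; only early_stopping_iter, return_scores, and the model-final output name come from this patch, and the positional order of evaluate's arguments is assumed.

    # Sketch only: paths, language code and some keyword names are placeholders.
    from spacy.cli import train, evaluate

    # Stop training once the mean dev accuracy (see _score_for_model above) has
    # not improved for 5 consecutive epochs, instead of always running n_iter epochs.
    train(
        lang="en",                 # assumed keyword name
        output_path="models",      # trained models land here, incl. model-final
        train_path="train.json",   # placeholder training data
        dev_path="dev.json",       # placeholder development data
        n_iter=30,
        early_stopping_iter=5,     # new in this patch; None disables early stopping
    )

    # Get the evaluation scores back as a dict instead of only printing the table.
    scores = evaluate("models/model-final", "dev.json", return_scores=True)
    print(scores["tags_acc"], scores["uas"], scores["las"])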
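
Also not part of the commit: a worked example of the early-stopping criterion computed by _score_for_model. The accuracy numbers are invented; only the key names and the per-task averaging come from the function in the diff.

    # Hypothetical meta produced after an epoch; keys mirror _score_for_model.
    meta = {
        "pipeline": ["tagger", "parser", "ner"],
        "accuracy": {
            "tags_acc": 96.0,
            "uas": 90.0, "las": 88.0,
            "ents_p": 84.0, "ents_r": 82.0, "ents_f": 83.0,
        },
    }

    tagger = meta["accuracy"]["tags_acc"]                             # 96.0
    parser = (meta["accuracy"]["uas"] + meta["accuracy"]["las"]) / 2  # 89.0
    ner = (meta["accuracy"]["ents_p"] + meta["accuracy"]["ents_r"]
           + meta["accuracy"]["ents_f"]) / 3                          # 83.0
    score = (tagger + parser + ner) / 3                               # ~89.33
    # This value is compared against best_score each epoch; training stops after
    # early_stopping_iter consecutive epochs without improvement.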