From cc1516ec262a12dd3aa85a7c81788af51aae77ff Mon Sep 17 00:00:00 2001
From: Krzysztof Kowalczyk
Date: Mon, 15 Apr 2019 12:04:36 +0200
Subject: [PATCH] Improved training and evaluation (#3538)

* Add early stopping
* Add return_scores option to evaluate
* Fix missing str to path conversion
* Fix import + old Python compatibility
* Fix bad beam_width setting during CPU evaluation in spacy train with GPU option turned on
---
 spacy/cli/evaluate.py |  4 ++++
 spacy/cli/train.py    | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 68a7eca2c..df391d730 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -17,6 +17,7 @@ from .. import displacy
     gpu_id=("Use GPU", "option", "g", int),
     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
+    return_scores=("Return dict containing model scores", "flag", "r", bool),
 )
 def evaluate(
     model,
@@ -25,6 +26,7 @@
     gold_preproc=False,
     displacy_path=None,
     displacy_limit=25,
+    return_scores=False,
 ):
     """
     Evaluate a model. To render a sample of parses in a HTML file, set an
@@ -75,6 +77,8 @@ def evaluate(
             ents=render_ents,
         )
         msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
+    if return_scores:
+        return scorer.scores


 def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 743fec9ea..5cf0f5f6f 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -35,6 +35,7 @@ from .. import about
     pipeline=("Comma-separated names of pipeline components", "option", "p", str),
     vectors=("Model to load vectors from", "option", "v", str),
     n_iter=("Number of iterations", "option", "n", int),
+    early_stopping_iter=("Maximum number of training epochs without dev accuracy improvement", "option", "e", int),
     n_examples=("Number of examples", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
     version=("Model version", "option", "V", str),
@@ -74,6 +75,7 @@ def train(
     pipeline="tagger,parser,ner",
     vectors=None,
     n_iter=30,
+    early_stopping_iter=None,
     n_examples=0,
     use_gpu=-1,
     version="0.0.0",
@@ -101,6 +103,7 @@ def train(
     train_path = util.ensure_path(train_path)
     dev_path = util.ensure_path(dev_path)
     meta_path = util.ensure_path(meta_path)
+    output_path = util.ensure_path(output_path)
     if raw_text is not None:
         raw_text = list(srsly.read_jsonl(raw_text))
     if not train_path or not train_path.exists():
@@ -222,6 +225,8 @@
     msg.row(row_head, **row_settings)
     msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
     try:
+        iter_since_best = 0
+        best_score = 0.
         for i in range(n_iter):
             train_docs = corpus.train_docs(
                 nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
@@ -276,7 +281,9 @@
                     gpu_wps = nwords / (end_time - start_time)
                     with Model.use_device("cpu"):
                         nlp_loaded = util.load_model_from_path(epoch_model_path)
-                        nlp_loaded.parser.cfg["beam_width"]
+                        for name, component in nlp_loaded.pipeline:
+                            if hasattr(component, "cfg"):
+                                component.cfg["beam_width"] = beam_width
                         dev_docs = list(
                             corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                         )
@@ -328,6 +335,18 @@
                 gpu_wps=gpu_wps,
             )
             msg.row(progress, **row_settings)
+            # early stopping
+            if early_stopping_iter is not None:
+                current_score = _score_for_model(meta)
+                if current_score < best_score:
+                    iter_since_best += 1
+                else:
+                    iter_since_best = 0
+                    best_score = current_score
+                if iter_since_best >= early_stopping_iter:
+                    msg.text("Early stopping, best iteration is: {}".format(i - iter_since_best))
+                    msg.text("Best score = {}; Final iteration score = {}".format(best_score, current_score))
+                    break
     finally:
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / "model-final"
@@ -337,6 +356,18 @@
         best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
         msg.good("Created best model", best_model_path)

+def _score_for_model(meta):
+    """ Returns mean score between tasks in pipeline that can be used for early stopping. """
+    mean_acc = list()
+    pipes = meta['pipeline']
+    acc = meta['accuracy']
+    if 'tagger' in pipes:
+        mean_acc.append(acc['tags_acc'])
+    if 'parser' in pipes:
+        mean_acc.append((acc['uas'] + acc['las']) / 2)
+    if 'ner' in pipes:
+        mean_acc.append((acc['ents_p'] + acc['ents_r'] + acc['ents_f']) / 3)
+    return sum(mean_acc) / len(mean_acc)

 @contextlib.contextmanager
 def _create_progress_bar(total):
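
Note (not part of the commit): a minimal sketch of how the two new options added above might be exercised through the Python API rather than the command line. The language code, file paths, and the "lang" keyword name are illustrative assumptions; only early_stopping_iter, return_scores, and the model-final output name come from this patch, and the positional order of evaluate's arguments is assumed.

    # Sketch only: paths, language code and some keyword names are placeholders.
    from spacy.cli import train, evaluate

    # Stop training once the mean dev accuracy (see _score_for_model above) has
    # not improved for 5 consecutive epochs, instead of always running n_iter epochs.
    train(
        lang="en",                 # assumed keyword name
        output_path="models",      # trained models land here, incl. model-final
        train_path="train.json",   # placeholder training data
        dev_path="dev.json",       # placeholder development data
        n_iter=30,
        early_stopping_iter=5,     # new in this patch; None disables early stopping
    )

    # Get the evaluation scores back as a dict instead of only printing the table.
    scores = evaluate("models/model-final", "dev.json", return_scores=True)
    print(scores["tags_acc"], scores["uas"], scores["las"])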
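
Also not part of the commit: a worked example of the early-stopping criterion computed by _score_for_model. The accuracy numbers are invented; only the key names and the per-task averaging come from the function in the diff.

    # Hypothetical meta produced after an epoch; keys mirror _score_for_model.
    meta = {
        "pipeline": ["tagger", "parser", "ner"],
        "accuracy": {
            "tags_acc": 96.0,
            "uas": 90.0, "las": 88.0,
            "ents_p": 84.0, "ents_r": 82.0, "ents_f": 83.0,
        },
    }

    tagger = meta["accuracy"]["tags_acc"]                             # 96.0
    parser = (meta["accuracy"]["uas"] + meta["accuracy"]["las"]) / 2  # 89.0
    ner = (meta["accuracy"]["ents_p"] + meta["accuracy"]["ents_r"]
           + meta["accuracy"]["ents_f"]) / 3                          # 83.0
    score = (tagger + parser + ner) / 3                               # ~89.33
    # This value is compared against best_score each epoch; training stops after
    # early_stopping_iter consecutive epochs without improvement.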