From d163115e91aaa6a0f73b05b05bcca9774d76bf7c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 7 Oct 2017 21:00:43 -0500
Subject: [PATCH 1/4] Add non-linearity after history features

---
 spacy/_ml.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 898d6ab49..23facb9fb 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -264,7 +264,8 @@ def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
         return layerize(noop())
     embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
                     for i in range(hist_size)]
-    embed = concatenate(*embed_tables)
+    embed = chain(concatenate(*embed_tables),
+                  LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
     ops = embed.ops
     def add_history_fwd(vectors_hists, drop=0.):
         vectors, hist_ids = vectors_hists

From 9d66a915da3c78346ebf6a47fac54dd5eb94c246 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 7 Oct 2017 21:02:38 -0500
Subject: [PATCH 2/4] Update training defaults

---
 spacy/cli/train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b27087056..80bb11798 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -78,11 +78,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
-                                  util.env_opt('dropout_to', 0.2),
-                                  util.env_opt('dropout_decay', 0.0))
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.6),
+                                  util.env_opt('dropout_to', 0.1),
+                                  util.env_opt('dropout_decay', 1e-5))
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
-                                   util.env_opt('batch_to', 16),
+                                   util.env_opt('batch_to', 4),
                                    util.env_opt('batch_compound', 1.001))
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_words = corpus.count_train()

From 42b401d08b5b4b6968d2ed3e70e0a3c580b6c60b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 7 Oct 2017 21:05:21 -0500
Subject: [PATCH 3/4] Change default hidden depth to 1

---
 spacy/syntax/nn_parser.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index fdcf1d2d1..153f7a484 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -239,7 +239,7 @@ cdef class Parser:
     """
     @classmethod
     def Model(cls, nr_class, **cfg):
-        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 2))
+        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
         token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
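Patch 1 above chains a layer-normalized Maxout after the concatenated per-column history embeddings, so the hist_size * nr_dim feature vector passes through a non-linearity before it is mixed into the state representation. A minimal NumPy sketch of that computation follows; the piece count, random initialization, and shapes are illustrative assumptions for intuition, not thinc's internals:

```python
# Sketch of: chain(concatenate(*embed_tables),
#                  LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
# All parameters here are randomly initialized stand-ins; thinc owns the
# real parameters and gradients.
import numpy as np

nr_class, hist_size, nr_dim, nr_piece = 20, 8, 8, 2  # nr_piece is assumed
width = hist_size * nr_dim

rng = np.random.RandomState(0)
# One embedding table per history column, mapping an action ID to nr_dim dims.
tables = [rng.normal(size=(nr_class, nr_dim)) for _ in range(hist_size)]
W = rng.normal(size=(width, nr_piece, width))  # maxout weights: (nO, nP, nI)
b = np.zeros((width, nr_piece))

def history_features(hist_ids):
    # hist_ids: (batch, hist_size) integer IDs of the most recent actions.
    embedded = np.concatenate(
        [tables[i][hist_ids[:, i]] for i in range(hist_size)], axis=-1)
    pieces = np.einsum('bi,opi->bop', embedded, W) + b  # (batch, nO, nP)
    maxed = pieces.max(axis=-1)                         # maxout non-linearity
    mu = maxed.mean(axis=-1, keepdims=True)             # layer normalization
    sd = maxed.std(axis=-1, keepdims=True)
    return (maxed - mu) / (sd + 1e-6)

feats = history_features(rng.randint(nr_class, size=(4, hist_size)))
print(feats.shape)  # (4, 64)
```

Keeping the input and output widths equal (hist_size*nr_dim on both sides of the Maxout) makes the layer a learned re-mixing of the history features that leaves downstream dimensions unchanged.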
From be4f0b64605b036f06fdd919253b719fdc88b5bb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 8 Oct 2017 02:08:12 -0500
Subject: [PATCH 4/4] Update defaults

---
 spacy/cli/train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 80bb11798..b27087056 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -78,11 +78,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.6),
-                                  util.env_opt('dropout_to', 0.1),
-                                  util.env_opt('dropout_decay', 1e-5))
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
+                                  util.env_opt('dropout_to', 0.2),
+                                  util.env_opt('dropout_decay', 0.0))
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
-                                   util.env_opt('batch_to', 4),
+                                   util.env_opt('batch_to', 16),
                                    util.env_opt('batch_compound', 1.001))
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_words = corpus.count_train()
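Patch 4 reverts the defaults tried in patch 2: dropout becomes constant again (from == to, decay 0.0) and the batch size compounds from 1 up to 16 rather than capping at 4. For intuition, simplified sketches of the two schedule generators these values feed; the linear decay and the clipping here are assumptions for illustration, and spacy.util's actual decaying/compounding formulas may differ:

```python
# Illustrative stand-ins for spacy.util.decaying / spacy.util.compounding.
from itertools import islice

def decaying(start, stop, decay):
    # Yield values decaying linearly from `start` toward `stop` (assumes
    # start >= stop); decay == 0.0 gives a constant schedule.
    nr_upd = 0
    while True:
        yield max(start - decay * nr_upd, stop)
        nr_upd += 1

def compounding(start, stop, compound):
    # Yield values growing from `start` toward `stop` by a factor of
    # `compound` per update, clipped at `stop`.
    curr = float(start)
    while True:
        yield min(curr, stop)
        curr *= compound

# With the restored defaults, dropout stays at 0.2 while the batch size
# grows from 1 toward 16 by 0.1% per update:
print(list(islice(decaying(0.2, 0.2, 0.0), 3)))                  # [0.2, 0.2, 0.2]
print([round(x, 3) for x in islice(compounding(1, 16, 1.001), 3)])  # [1.0, 1.001, 1.002]
```

The compounding batch schedule means early updates are cheap and frequent, while later updates average over larger batches for lower-variance gradients.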