diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg
index 228289128..eab68a27f 100644
--- a/examples/experiments/onto-ner.cfg
+++ b/examples/experiments/onto-ner.cfg
@@ -5,7 +5,7 @@
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length or number of examples.
-max_length = 5000
+max_length = 3000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
@@ -17,20 +17,20 @@ max_steps = 0
 eval_frequency = 1000
 # Other settings
 seed = 0
-accumulate_gradient = 2
+accumulate_gradient = 1
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
 scores = ["speed", "ents_p", "ents_r", "ents_f"]
 score_weights = {"ents_f": 1.0}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-discard_oversize = true
+discard_oversize = false
 omit_extra_lookups = false
-batch_by_words = true
+batch_by = "words"

 [training.batch_size]
 @schedules = "compounding.v1"
-start = 1000
+start = 100
 stop = 1000
 compound = 1.001
@@ -45,12 +45,6 @@ use_averages = true
 eps = 1e-8
 learn_rate = 0.001

-#[training.optimizer.learn_rate]
-#@schedules = "warmup_linear.v1"
-#warmup_steps = 1000
-#total_steps = 50000
-#initial_rate = 0.003
-
 [nlp]
 lang = "en"
 vectors = null
@@ -74,6 +68,6 @@ width = 96
 depth = 4
 window_size = 1
 embed_size = 2000
-maxout_pieces = 1
+maxout_pieces = 3
 subword_features = true
 dropout = ${training:dropout}
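
For reference, here is a minimal Python sketch of what the [training.batch_size] change does, assuming (as in Thinc's compounding schedule) that "compounding.v1" starts at `start`, multiplies the value by `compound` after every batch, and clips it at `stop`. This is an illustration of the schedule's shape, not spaCy's actual implementation:

    def compounding(start: float, stop: float, compound: float):
        """Yield batch sizes growing geometrically from `start` toward `stop`.

        Each value is the previous one multiplied by `compound`, clipped so
        it never exceeds `stop`.
        """
        curr = start
        while True:
            yield curr
            curr = min(curr * compound, stop)

    # With the new settings (start = 100, stop = 1000, compound = 1.001),
    # the batch size grows from 100 and hits the cap of 1000 after roughly
    # 2,300 batches, since 100 * 1.001**n >= 1000 at n ~ ln(10)/ln(1.001) ~ 2304.
    sizes = compounding(100, 1000, 1.001)
    print(next(sizes))  # -> 100

Lowering `start` from 1000 to 100 therefore makes the earliest updates much smaller, which pairs with dropping `accumulate_gradient` from 2 to 1. Note also that `batch_by = "words"` means these sizes are word counts rather than document counts, and that with `discard_oversize = false` a document longer than the current batch size is, as I understand the word-batching behavior, placed in a batch of its own rather than dropped.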