diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg
index 8970bb3c0..228289128 100644
--- a/examples/experiments/onto-ner.cfg
+++ b/examples/experiments/onto-ner.cfg
@@ -13,24 +13,25 @@ dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 100000
 max_epochs = 0
-max_steps = 100000
-eval_frequency = 2000
+max_steps = 0
+eval_frequency = 1000
 # Other settings
 seed = 0
-accumulate_gradient = 1
+accumulate_gradient = 2
 use_pytorch_for_gpu_memory = false
 # Control how scores are printed and checkpoints are evaluated.
 scores = ["speed", "ents_p", "ents_r", "ents_f"]
 score_weights = {"ents_f": 1.0}
 # These settings are invalid for the transformer models.
 init_tok2vec = null
-discard_oversize = false
+discard_oversize = true
 omit_extra_lookups = false
+batch_by_words = true
 
 [training.batch_size]
 @schedules = "compounding.v1"
-start = 100
-stop = 2000
+start = 1000
+stop = 1000
 compound = 1.001
 
 [training.optimizer]
@@ -38,7 +39,7 @@ compound = 1.001
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
-L2 = 0.0
+L2 = 0.01
 grad_clip = 1.0
 use_averages = true
 eps = 1e-8
@@ -64,15 +65,15 @@ min_action_freq = 1
 nr_feature_tokens = 3
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
+use_upper = true
 
 [nlp.pipeline.ner.model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
 pretrained_vectors = ${nlp:vectors}
-width = 300
+width = 96
 depth = 4
 window_size = 1
-embed_size = 7000
+embed_size = 2000
 maxout_pieces = 1
 subword_features = true
 dropout = ${training:dropout}
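
Note on the [training.batch_size] change: a compounding schedule multiplies the previous value by `compound` each step and clips the result at `stop`, so moving from start = 100 / stop = 2000 to start = 1000 / stop = 1000 replaces a growing batch size with a constant one. The sketch below is a hypothetical stand-in illustrating that semantics, not spaCy's or Thinc's actual "compounding.v1" implementation:

    # sketch.py -- minimal illustration of compounding-schedule semantics
    # (hypothetical helper; values taken from this diff).
    from typing import Iterator

    def compounding(start: float, stop: float, compound: float) -> Iterator[float]:
        """Yield sizes that grow by a compounding factor, clipped at `stop`."""
        size = start
        while True:
            yield min(size, stop)
            size *= compound

    old = compounding(100, 2000, 1.001)   # previous settings
    new = compounding(1000, 1000, 1.001)  # settings after this diff

    print([round(next(old), 1) for _ in range(3)])  # [100.0, 100.1, 100.2] -- grows toward 2000
    print([round(next(new), 1) for _ in range(3)])  # [1000.0, 1000.0, 1000.0] -- constant

With batch_by_words = true, that constant 1000 is counted in words per batch rather than in examples, and discard_oversize = true drops examples that exceed the budget on their own (hedged reading of the train CLI's batching behavior at the time).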