diff --git a/website/usage/_data.json b/website/usage/_data.json
index fa7a1fcd2..193787d91 100644
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -150,6 +150,7 @@
             "Tagger & Parser": "tagger-parser",
             "Similarity": "similarity",
             "Text Classification": "textcat",
+            "Tips and Advice": "tips",
             "Saving & Loading": "saving-loading"
         }
     },
diff --git a/website/usage/_training/_basics.jade b/website/usage/_training/_basics.jade
index 6ac1477e8..abacfc839 100644
--- a/website/usage/_training/_basics.jade
+++ b/website/usage/_training/_basics.jade
@@ -206,6 +206,10 @@ p
     | use the #[+api("cli#package") #[code package]] command to generate an
     | installable Python package from your model.

++code(false, "bash").
+    spacy convert /tmp/train.conllu /tmp/data
+    spacy train en /tmp/model /tmp/data/train.json -n 5
+
 +h(3, "training-simple-style") Simple training style
     +tag-new(2)
diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade
index b8ef43916..383db0f29 100644
--- a/website/usage/_training/_ner.jade
+++ b/website/usage/_training/_ner.jade
@@ -2,12 +2,9 @@

 p
     | All #[+a("/models") spaCy models] support online learning, so
-    | you can update a pre-trained model with new examples. To update the
-    | model, you first need to create an instance of
-    | #[+api("goldparse") #[code GoldParse]], with the entity labels
-    | you want to learn. You'll usually need to provide many examples to
-    | meaningfully improve the system — a few hundred is a good start, although
-    | more is better.
+    | you can update a pre-trained model with new examples. You'll usually
+    | need to provide many #[strong examples] to meaningfully improve the
+    | system — a few hundred is a good start, although more is better.

 p
     | You should avoid iterating over the same few examples multiple times, or
@@ -21,7 +18,7 @@ p
     | the model of other examples by augmenting your annotations with sentences
     | annotated with entities automatically recognised by the original model.
     | Ultimately, this is an empirical process: you'll need to
-    | #[strong experiment on your own data] to find a solution that works best
+    | #[strong experiment on your data] to find a solution that works best
     | for you.

 +h(3, "example-train-ner") Updating the Named Entity Recognizer
diff --git a/website/usage/_training/_saving-loading.jade b/website/usage/_training/_saving-loading.jade
index e6e54385c..885b3ad24 100644
--- a/website/usage/_training/_saving-loading.jade
+++ b/website/usage/_training/_saving-loading.jade
@@ -41,7 +41,7 @@ p
         "author": "You",
         "email": "you@example.com",
         "license": "CC BY-SA 3.0",
-        "pipeline": ["token_vectors", "tagger"]
+        "pipeline": ["tagger", "parser", "ner"]
     }

 +code(false, "bash").
@@ -94,26 +94,13 @@ p
     | The #[code load()] method that comes with our model package
     | templates will take care of putting all this together and returning a
     | #[code Language] object with the loaded pipeline and data. If your model
-    | requires custom pipeline components, you should
-    | #[strong ship then with your model] and register their
-    | #[+a("/usage/processing-pipelines#creating-factory") factories]
-    | via #[+api("spacy#set_factory") #[code set_factory()]].
-
-+aside-code("Factory example").
-    def my_factory(vocab):
-        # load some state
-        def my_component(doc):
-            # process the doc
-            return doc
-        return my_component
-
-+code.
-    spacy.set_factory('custom_component', custom_component_factory)
-
-+infobox("Custom models with pipeline components")
-    | For more details and an example of how to package a sentiment model
-    | with a custom pipeline component, see the usage guide on
-    | #[+a("/usage/processing-pipelines#example2") language processing pipelines].
+    | requires custom #[+a("/usage/processing-pipelines") pipeline components]
+    | or a custom language class, you can also
+    | #[strong ship the code with your model]. For examples of this, check out
+    | the implementations of spaCy's
+    | #[+api("util#load_model_from_init_py") #[code load_model_from_init_py]]
+    | and #[+api("util#load_model_from_path") #[code load_model_from_path]]
+    | utility functions.

 +h(3, "models-building") Building the model package

@@ -155,8 +142,7 @@ p
     | #[+api("language#from_disk") #[code from_disk]] instead.

 +code.
-    from spacy.lang.en import English
-    nlp = English().from_disk('/path/to/data')
+    nlp = spacy.blank('en').from_disk('/path/to/data')

 +infobox("Important note: Loading data in v2.x")
     .o-block
@@ -168,7 +154,7 @@ p
         | spaCy v2.0 solves this with a clear distinction between setting up
         | the instance and loading the data.

-    +code-new nlp = English().from_disk('/path/to/data')
+    +code-new nlp = spacy.blank('en').from_disk('/path/to/data')
     +code-old nlp = spacy.load('en', path='/path/to/data')

 +h(3, "example-training-spacy") Example: How we're training and packaging models for spaCy
diff --git a/website/usage/_training/_tips.jade b/website/usage/_training/_tips.jade
new file mode 100644
index 000000000..942c8de8d
--- /dev/null
+++ b/website/usage/_training/_tips.jade
@@ -0,0 +1,135 @@
+//- 💫 DOCS > USAGE > TRAINING > OPTIMIZATION TIPS AND ADVICE
+
+p
+    | There are lots of conflicting "recipes" for training deep neural
+    | networks at the moment. The cutting-edge models take a very long time to
+    | train, so most researchers can't run enough experiments to figure out
+    | what's #[em really] going on. For what it's worth, here's a recipe that
+    | seems to work well on a lot of problems:
+
++code("Batch heuristic").
+    from spacy.util import minibatch, compounding
+
+    def get_batches(train_data, model_type):
+        max_batch_sizes = {'tagger': 32, 'parser': 16, 'ner': 16, 'textcat': 64}
+        max_batch_size = max_batch_sizes[model_type]
+        if len(train_data) < 1000:
+            max_batch_size /= 2
+        if len(train_data) < 500:
+            max_batch_size /= 2
+        batch_size = compounding(1, max_batch_size, 1.001)
+        batches = minibatch(train_data, size=batch_size)
+        return batches
+
+p
+    | This will set the batch size to start at #[code 1], and increase each
+    | batch until it reaches a maximum size. The tagger, parser and entity
+    | recognizer all take whole sentences as input, so they're learning a lot
+    | of labels in a single example. You therefore need smaller batches for
+    | them. The batch size for the text categorizer should be somewhat larger,
+    | especially if your documents are long.
+
+p
+    | The trick of increasing the batch size is starting to become quite
+    | popular (see #[+a("https://arxiv.org/abs/1711.00489") Smith et al., 2017]).
+    | Their recipe is quite different from how spaCy's models are being
+    | trained, but there are some similarities. In training the various spaCy
+    | models, we haven't found much advantage from decaying the learning
+    | rate – but starting with a low batch size has definitely helped. You
+    | should try it out on your data, and see how you go.
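+
+p
+    | To see how this fits together, here's a rough sketch of a training loop
+    | that uses the batch heuristic above. It assumes #[code train_data] is a
+    | list of #[code (text, annotations)] tuples in the simple training style,
+    | and that the pipeline already contains the component you're updating;
+    | the model name is only a placeholder.
+
++code("Using the batch heuristic (sketch)").
+    import random
+    import spacy
+
+    nlp = spacy.load('en_core_web_sm')  # placeholder: any pipeline with an NER
+    optimizer = nlp.begin_training()
+    for epoch in range(10):
+        random.shuffle(train_data)
+        losses = {}
+        for batch in get_batches(train_data, 'ner'):
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
+                       losses=losses)
+        print(losses)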
+
++h(3, "tips-hyperparams") Learning rate, regularization and gradient clipping
+
+p
+    | By default spaCy uses the Adam solver, with default settings
+    | (learning rate #[code 0.001], #[code beta1=0.9], #[code beta2=0.999]).
+    | Some researchers have said they found these settings terrible on their
+    | problems – but they've always performed very well in training spaCy's
+    | models, in combination with the rest of our recipe. You can change these
+    | settings directly, by modifying the corresponding attributes on the
+    | #[code optimizer] object. You can also set environment variables, to
+    | adjust the defaults.
+
+p
+    | There are two other key hyper-parameters of the solver: #[code L2]
+    | #[strong regularization], and #[strong gradient clipping]
+    | (#[code max_grad_norm]). Gradient clipping is a hack that's not discussed
+    | often, but that everybody seems to be using. It's quite important in
+    | helping to ensure the network doesn't diverge, which is a fancy way of
+    | saying "fall over during training". The effect is sort of similar to
+    | setting the learning rate low. It can also compensate for a large batch
+    | size (this is a good example of how the choices of all these
+    | hyper-parameters intersect).
+
++h(3, "tips-dropout") Dropout rate
+
+p
+    | For small datasets, it's useful to set a
+    | #[strong high dropout rate at first], and #[strong decay] it down towards
+    | a more reasonable value. This helps avoid the network immediately
+    | overfitting, while still encouraging it to learn some of the more
+    | interesting things in your data. spaCy comes with a
+    | #[+api("top-level#util.decaying") #[code decaying]] utility function to
+    | facilitate this. You might try setting:
+
++code.
+    from spacy.util import decaying
+    dropout = decaying(0.6, 0.2, 1e-4)
+
+p
+    | You can then draw values from the iterator with #[code next(dropout)],
+    | which you would pass to the #[code drop] keyword argument of
+    | #[+api("language#update") #[code nlp.update]]. It's pretty much always a
+    | good idea to use at least #[strong some dropout]. All of the models
+    | currently use Bernoulli dropout, for no particularly principled reason –
+    | we just haven't experimented with another scheme like Gaussian dropout
+    | yet.
+
++h(3, "tips-param-avg") Parameter averaging
+
+p
+    | The last part of our optimisation recipe is #[strong parameter averaging],
+    | an old trick introduced by
+    | #[+a("https://cseweb.ucsd.edu/~yfreund/papers/LargeMarginsUsingPerceptron.pdf") Freund and Schapire (1999)],
+    | popularised in the NLP community by
+    | #[+a("http://www.aclweb.org/anthology/P04-1015") Collins (2002)],
+    | and explained in more detail by
+    | #[+a("http://leon.bottou.org/projects/sgd") Leon Bottou]. Just about the
+    | only other people who seem to be using this for neural network training
+    | are the SyntaxNet team (one of whom is Michael Collins) – but it really
+    | seems to work great on every problem.
+
+p
+    | The trick is to store the moving average of the weights during training.
+    | We don't optimise this average – we just track it. Then when we want to
+    | actually use the model, we use the averages, not the most recent value.
+    | In spaCy (and #[+a(gh("thinc")) Thinc]) this is done by using a
+    | context manager, #[+api("language#use_params") #[code use_params]], to
+    | temporarily replace the weights:
+
++code.
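+    # Temporarily swap in the averaged weights, so the saved model uses the
+    # running averages rather than the most recent parameter values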
+    with nlp.use_params(optimizer.averages):
+        nlp.to_disk('/model')
+
+p
+    | The context manager is handy because you naturally want to evaluate and
+    | save the model at various points during training (e.g. after each epoch).
+    | After evaluating and saving, the context manager will exit and the
+    | weights will be restored, so you resume training from the most recent
+    | value, rather than the average. By evaluating the model after each epoch,
+    | you can remove one hyper-parameter from consideration (the number of
+    | epochs). Having one less magic number to guess is extremely nice – so
+    | having the averaging under a context manager is very convenient.
+
++h(3, "tips-transfer-learning") Transfer learning
+
+p
+    | Finally, if you're training from a small data set, it's very useful to
+    | start off with some knowledge already in the model. #[strong Word vectors]
+    | are an easy and reliable way to do that, but depending on the
+    | application, you may also be able to start with useful knowledge from one
+    | of spaCy's #[+a("/models") pre-trained models], such as the parser,
+    | entity recogniser and tagger. If you're adapting a pre-trained model and
+    | you want it to retain accuracy on the tasks it was originally trained
+    | for, you should consider the "catastrophic forgetting" problem.
+    | #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) See this blog post]
+    | to read more about the problem and our suggested solution,
+    | pseudo-rehearsal.
diff --git a/website/usage/training.jade b/website/usage/training.jade
index 8f15668c4..ec8401fb1 100644
--- a/website/usage/training.jade
+++ b/website/usage/training.jade
@@ -28,6 +28,10 @@ p
     +h(2, "textcat") Training a text classification model
     include _training/_textcat

++section("tips")
+    +h(2, "tips") Optimization tips and advice
+    include _training/_tips
+
 +section("saving-loading")
     +h(2, "saving-loading") Saving and loading models
     include _training/_saving-loading