diff --git a/website/usage/_deep-learning/_pytorch.jade b/website/usage/_deep-learning/_pytorch.jade
index cf0f692f9..e3b2fee7b 100644
--- a/website/usage/_deep-learning/_pytorch.jade
+++ b/website/usage/_deep-learning/_pytorch.jade
@@ -9,83 +9,8 @@
     |  to create spaCy pipeline components, to add annotations to the
     |  #[code Doc] object.
 
-+under-construction
-
-p
-    |  Here's how a #[code begin_update] function that wraps an arbitrary
-    |  PyTorch model would look:
-
 +code.
-    class PytorchWrapper(thinc.neural.Model):
-        def __init__(self, pytorch_model):
-            self.pytorch_model = pytorch_model
+    from thinc.extra.wrappers import PyTorchWrapper
+    model = PyTorchWrapper(YOUR_PYTORCH_MODEL)
 
-        def begin_update(self, x_data, drop=0.):
-            x_var = Variable(x_data)
-            # Make prediction
-            y_var = pytorch_model.forward(x_var)
-            def backward(dy_data, sgd=None):
-                dy_var = Variable(dy_data)
-                dx_var = torch.autograd.backward(x_var, dy_var)
-                return dx_var
-            return y_var.data, backward
-
-p
-    |  PyTorch requires data to be wrapped in a container, #[code Variable],
-    |  that tracks the operations performed on the data. This "tape" of
-    |  operations is then used by #[code torch.autograd.backward] to compute the
-    |  gradient with respect to the input. For example, the following code
-    |  constructs a PyTorch Linear layer that takes a vector of shape
-    |  #[code (length, 2)], multiples it by a #[code (2, 2)] matrix of weights,
-    |  adds a #[code (2,)] bias, and returns the resulting #[code (length, 2)]
-    |  vector:
-
-+code("PyTorch Linear").
-    from torch import autograd
-    from torch import nn
-    import torch
-    import numpy
-
-    pt_model = nn.Linear(2, 2)
-    length = 5
-
-    input_data = numpy.ones((5, 2), dtype='f')
-    input_var = autograd.Variable(torch.Tensor(input_data))
-
-    output_var = pt_model(input_var)
-    output_data = output_var.data.numpy()
-
-p
-    |  Given target values we would like the output data to approximate, we can
-    |  then "learn" values of the parameters within #[code pt_model], to give us
-    |  output that's closer to our target. As a trivial example, let's make the
-    |  linear layer compute the negative inverse of the input:
-
-+code.
-    def get_target(input_data):
-        return -(1 / input_data)
-
-p
-    |  To update the PyTorch model, we create an optimizer and give it
-    |  references to the model's parameters. We'll then randomly generate input
-    |  data and get the target result we'd like the function to produce. We then
-    |  compute the #[strong gradient of the error] between the current output
-    |  and the target. Using the most popular definition of "error", this is
-    |  simply the average difference:
-
-+code.
-    from torch import optim
-
-    optimizer = optim.SGD(pt_model.parameters(), lr = 0.01)
-    for i in range(10):
-        input_data = numpy.random.uniform(-1., 1., (length, 2))
-        target = -(1 / input_data)
-
-        output_var = pt_model(autograd.Variable(torch.Tensor(input_data)))
-        output_data = output_var.data.numpy()
-
-        d_output_data = (output_data - target) / length
-        d_output_var = autograd.Variable(torch.Tensor(d_output_data))
-
-        d_input_var = torch.autograg.backward(output_var, d_output_var)
-        optimizer.step()
++github("spacy", "examples/training/train_pytorch_textcat.py")
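
For context, here is a minimal sketch of how the new thinc.extra.wrappers.PyTorchWrapper referenced above would be used. The numpy-in/numpy-out behaviour and the begin_update call are assumptions carried over from the hand-rolled wrapper this diff removes; the toy torch.nn.Linear module is only illustrative. See the linked examples/training/train_pytorch_textcat.py for the full pipeline integration.

    # Sketch only: wrap a small PyTorch module with thinc's PyTorchWrapper
    # and run one forward/backward pass. Exact I/O types are assumed, not
    # guaranteed by this change.
    import numpy
    import torch.nn
    from thinc.extra.wrappers import PyTorchWrapper

    pytorch_model = torch.nn.Linear(2, 2)   # any nn.Module in place of YOUR_PYTORCH_MODEL
    model = PyTorchWrapper(pytorch_model)

    X = numpy.zeros((5, 2), dtype="f")
    Y, backprop = model.begin_update(X)      # forward pass through the PyTorch model
    dX = backprop(numpy.ones_like(Y))        # backward pass, gradient w.r.t. the input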