diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 78c06f62a..6eda4ca7e 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -13,12 +13,14 @@
 from thinc.misc import LayerNorm as LN
 from thinc.neural.util import prefer_gpu
 from wasabi import msg
 import srsly
+from thinc.neural.util import to_categorical
 
 from ..errors import Errors
 from ..tokens import Doc
 from ..attrs import ID, HEAD
 from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
 from .._ml import masked_language_model, get_cossim_loss
+from .._ml import MultiSoftmax
 from .. import util
 from .train import _load_pretrained_tok2vec
 
@@ -125,11 +127,7 @@ def pretrain(
         config[key] = str(config[key])
     util.fix_random_seed(seed)
 
-    has_gpu = prefer_gpu()
-    if has_gpu:
-        import torch
-
-        torch.set_default_tensor_type("torch.cuda.FloatTensor")
+    has_gpu = prefer_gpu(gpu_id=1)
     msg.info("Using GPU" if has_gpu else "Not using GPU")
 
     output_dir = Path(output_dir)
@@ -175,6 +173,7 @@ def pretrain(
             subword_features=not use_chars,  # Set to False for Chinese etc
             cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
         ),
+        objective=loss_func
     )
     # Load in pretrained weights
     if init_tok2vec is not None:
@@ -265,7 +264,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
+    if objective == "characters":
+        loss, gradients = get_characters_loss(model.ops, docs, predictions)
+    else:
+        loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
     backprop(gradients, sgd=optimizer)
     # Don't want to return a cupy object here
     # The gradients are modified in-place by the BERT MLM,
@@ -303,6 +305,17 @@ def make_docs(nlp, batch, min_length, max_length):
     return docs, skip_count
 
 
+def get_characters_loss(ops, docs, prediction, nr_char=10):
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
 def get_vectors_loss(ops, docs, prediction, objective="L2"):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.
@@ -327,16 +340,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
     return loss, d_target
 
 
-def create_pretraining_model(nlp, tok2vec):
+def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
     """
-    output_size = nlp.vocab.vectors.data.shape[1]
-    output_layer = chain(
-        LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
-    )
+    if objective == "characters":
+        out_sizes = [256] * nr_char
+        output_layer = chain(
+            LN(Maxout(300, pieces=3)),
+            MultiSoftmax(out_sizes, 300)
+        )
+    else:
+        output_size = nlp.vocab.vectors.data.shape[1]
+        output_layer = chain(
+            LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
+        )
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
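
Note on the new character objective: get_characters_loss builds its target by taking a fixed window of UTF-8 bytes per token (via Doc.to_utf8_array), one-hot encoding each byte over 256 classes, and concatenating the windows into a (n_tokens, 256 * nr_char) matrix; the loss is then a summed squared error between that target and the network's output. The sketch below reproduces the same computation in plain numpy, without the spaCy/thinc dependencies; the random prediction and toy byte ids are placeholders, not spaCy API.

import numpy

def characters_loss_sketch(prediction, target_ids, nr_char=10):
    # One-hot encode each byte id over 256 classes, mirroring
    # to_categorical(target_ids, nb_classes=256) in the patch.
    target = numpy.eye(256, dtype="f")[target_ids.reshape(-1)]
    # Concatenate the nr_char one-hot blocks per token, giving
    # shape (n_tokens, 256 * nr_char) to match the output layer.
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    # Backprop signal as used in the patch: the raw difference scaled
    # by the number of rows (not the exact analytic gradient of loss).
    d_target = diff / float(prediction.shape[0])
    return loss, d_target

# Toy usage: 4 tokens, 10 bytes each, random "predictions".
ids = numpy.random.randint(0, 256, size=(4, 10))
pred = numpy.random.uniform(0, 1, size=(4, 2560)).astype("f")
loss, grad = characters_loss_sketch(pred, ids)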
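
On the output layer: MultiSoftmax predicts several distributions at once. Its 256 * nr_char activations are split into nr_char blocks of 256, and a softmax is applied within each block, so every block is a distribution over the possible byte values at one character position. A rough numpy sketch of that forward pass follows; the explicit W and b parameters are illustrative assumptions, not spaCy's internal layout.

import numpy

def multi_softmax_forward(X, W, b, out_sizes):
    # One affine projection produces the logits for every block.
    Y = X @ W.T + b  # shape: (n_tokens, sum(out_sizes))
    start = 0
    for size in out_sizes:
        block = Y[:, start:start + size]
        block -= block.max(axis=1, keepdims=True)  # numerical stability
        e = numpy.exp(block)
        # Normalise within the block only, so each block sums to 1.
        Y[:, start:start + size] = e / e.sum(axis=1, keepdims=True)
        start += size
    return Y

# Toy usage mirroring MultiSoftmax([256] * 10, 300) from the patch.
out_sizes = [256] * 10
X = numpy.random.randn(4, 300).astype("f")
W = numpy.zeros((sum(out_sizes), 300), dtype="f")
b = numpy.zeros(sum(out_sizes), dtype="f")
Y = multi_softmax_forward(X, W, b, out_sizes)
assert numpy.allclose(Y[:, :256].sum(axis=1), 1.0)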