diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index 600436a1d..a61af3660 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -45,7 +45,7 @@
 EPOCHS = 10
 DROPOUT = 0.2
 LEARN_RATE = 0.005
 L2 = 1e-6
-CONTEXT_WIDTH=128
+CONTEXT_WIDTH = 128
 
 def run_pipeline():
@@ -138,7 +138,9 @@ def run_pipeline():
     # STEP 6: create and train the entity linking pipe
     if train_pipe:
         print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
-        el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH})
+        el_pipe = nlp_2.create_pipe(name='entity_linker',
+                                    config={"context_width": CONTEXT_WIDTH,
+                                            "pretrained_vectors": nlp_2.vocab.vectors.name})
         el_pipe.set_kb(kb_2)
         nlp_2.add_pipe(el_pipe, last=True)
 
@@ -195,11 +197,11 @@
            if batchnr > 0:
                with el_pipe.model.use_params(optimizer.averages):
                    el_pipe.context_weight = 1
-                   el_pipe.prior_weight = 0
+                   el_pipe.prior_weight = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
-                         " / dev acc context avg", round(dev_acc_context, 3))
+                         " / dev acc avg", round(dev_acc_context, 3))
 
     # STEP 7: measure the performance of our trained pipe on an independent dev set
     if len(dev_data) and measure_performance:
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 5a5bfa07e..07037f653 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -666,15 +666,16 @@ def build_nel_encoder(embed_width, hidden_width, **cfg):
     entity_width = cfg.get("entity_width")
 
     with Model.define_operators({">>": chain, "**": clone}):
-        model = Affine(1, entity_width+context_width, drop_factor=0.0)\
+        model = Affine(entity_width, entity_width+context_width+1)\
+            >> Affine(1, entity_width, drop_factor=0.0)\
            >> logistic
 
         # context encoder
         tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
-                          cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth,
+                          cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth,
                           bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
             >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
-            >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
+            >> zero_init(Affine(context_width, hidden_width))
 
         model.tok2vec = tok2vec
 
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 25df31f70..d3f6fa776 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1132,7 +1132,8 @@ class EntityLinker(Pipe):
         context_docs = []
         entity_encodings = []
-        labels = []
+        cats = []
+        priors = []
 
         for doc, gold in zip(docs, golds):
             for entity in gold.links:
@@ -1143,27 +1144,33 @@
                 nr_neg = 0
                 for c in candidates:
                     kb_id = c.entity_
+                    entity_encoding = c.entity_vector
+                    entity_encodings.append(entity_encoding)
+                    context_docs.append(doc)
+
+                    if self.prior_weight > 0:
+                        priors.append([c.prior_prob])
+                    else:
+                        priors.append([0])
+
                     if kb_id == gold_kb:
-                        entity_encoding = c.entity_vector
-                        entity_encodings.append(entity_encoding)
-                        context_docs.append(doc)
-                        labels.append([1])
-                    else:  # elif nr_neg < 1:
+                        cats.append([1])
+                    else:
                         nr_neg += 1
-                        entity_encoding = c.entity_vector
-                        entity_encodings.append(entity_encoding)
-                        context_docs.append(doc)
-                        labels.append([0])
+                        cats.append([0])
 
         if len(entity_encodings) > 0:
+            assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats)
+
             context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop)
             entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
 
-            mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
+            mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i]
+                                 for i in range(len(entity_encodings))]
             pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop)
-            labels = self.model.ops.asarray(labels, dtype="float32")
+            cats = self.model.ops.asarray(cats, dtype="float32")
 
-            loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None)
+            loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None)
             mention_gradient = bp_mention(d_scores, sgd=sgd)
 
             context_gradients = [list(x[0:self.context_width]) for x in mention_gradient]
@@ -1221,13 +1228,19 @@
                candidates = self.kb.get_candidates(ent.text)
                if candidates:
                    random.shuffle(candidates)
+
+                   # this will set the prior probabilities to 0 (just like in training) if their weight is 0
                    prior_probs = xp.asarray([[c.prior_prob] for c in candidates])
                    prior_probs *= self.prior_weight
+                   scores = prior_probs
 
-                   entity_encodings = xp.asarray([c.entity_vector for c in candidates])
-                   mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
-                   predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
-                   scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions)))
+                   if self.context_weight > 0:
+                       entity_encodings = xp.asarray([c.entity_vector for c in candidates])
+                       assert len(entity_encodings) == len(prior_probs)
+                       mention_encodings = [list(context_encoding) + list(entity_encodings[i])
+                                            + list(prior_probs[i])
+                                            for i in range(len(entity_encodings))]
+                       scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
 
                    # TODO: thresholding
                    best_index = scores.argmax()
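
Editor's note, not part of the diff: a minimal sketch of what the patch changes about the model input. Each candidate's feature vector is now the concatenation of the context encoding, the entity vector and the candidate's prior probability, which is why the first layer becomes Affine(entity_width, entity_width + context_width + 1). The widths and dummy candidate data below are illustrative assumptions, not spaCy defaults.

# Minimal sketch, assuming illustrative widths and toy candidates.
import numpy

CONTEXT_WIDTH = 128  # width of the pooled context encoding (tok2vec output)
ENTITY_WIDTH = 64    # width of the entity vectors stored in the knowledge base

# one context encoding for the mention's document, plus two hypothetical candidates
context_encoding = numpy.random.rand(CONTEXT_WIDTH).astype("float32")
candidates = [
    {"entity_vector": numpy.random.rand(ENTITY_WIDTH).astype("float32"), "prior_prob": 0.8},
    {"entity_vector": numpy.random.rand(ENTITY_WIDTH).astype("float32"), "prior_prob": 0.2},
]

# As in the patched update()/predict() code: concatenate context encoding,
# entity encoding and prior probability into one feature vector per candidate,
# so the model weighs the prior against contextual similarity itself instead
# of the two scores being combined after the fact.
mention_encodings = [
    list(context_encoding) + list(c["entity_vector"]) + [c["prior_prob"]]
    for c in candidates
]

# matches the new input size: entity_width + context_width + 1
assert all(len(m) == CONTEXT_WIDTH + ENTITY_WIDTH + 1 for m in mention_encodings)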