diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py
index a2db2dc95..b9a0dc843 100644
--- a/examples/pipeline/wiki_entity_linking/train_el.py
+++ b/examples/pipeline/wiki_entity_linking/train_el.py
@@ -24,8 +24,6 @@ from thinc.misc import LayerNorm as LN
 # from spacy.cli.pretrain import get_cossim_loss
 from spacy.matcher import PhraseMatcher
 
-""" TODO: this code needs to be implemented in pipes.pyx"""
-
 
 
 class EL_Model:
@@ -45,7 +43,7 @@ class EL_Model:
     DROP = 0.1
     LEARN_RATE = 0.001
-    EPOCHS = 20
+    EPOCHS = 5
     L2 = 1e-6
 
     name = "entity_linker"
@@ -213,8 +211,7 @@ class EL_Model:
         if avg:
             with self.article_encoder.use_params(self.sgd_article.averages) \
                     and self.desc_encoder.use_params(self.sgd_desc.averages)\
-                    and self.sent_encoder.use_params(self.sgd_sent.averages)\
-                    and self.cont_encoder.use_params(self.sgd_cont.averages):
+                    and self.sent_encoder.use_params(self.sgd_sent.averages):
                 desc_encodings = self.desc_encoder(desc_docs)
                 doc_encoding = self.article_encoder([article_doc])
                 sent_encoding = self.sent_encoder([sent_doc])
@@ -226,7 +223,13 @@ class EL_Model:
 
         concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]
 
-        cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+        if avg:
+            with self.cont_encoder.use_params(self.sgd_cont.averages):
+                cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+
+        else:
+            cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+
         context_enc = np.transpose(cont_encodings)
 
         highest_sim = -5
@@ -298,8 +301,8 @@ class EL_Model:
         self.sgd_desc.learn_rate = self.LEARN_RATE
         self.sgd_desc.L2 = self.L2
 
-    def get_loss(self, v1, v2, targets):
-        loss, gradients = self.get_cossim_loss(v1, v2, targets)
+    def get_loss(self, pred, gold, targets):
+        loss, gradients = self.get_cossim_loss(pred, gold, targets)
         return loss, gradients
 
     def get_cossim_loss(self, yh, y, t):
@@ -327,8 +330,6 @@ class EL_Model:
         return loss, -inverse
 
     def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents):
-        all_clusters = list(entity_clusters.keys())
-
         arts_list = list()
         sents_list = list()
        descs_list = list()
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index 2ebf9973e..40d737a6f 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -111,7 +111,7 @@ if __name__ == "__main__":
         print("STEP 6: training", datetime.datetime.now())
         my_nlp = spacy.load('en_core_web_md')
         trainer = EL_Model(kb=my_kb, nlp=my_nlp)
-        trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500)
+        trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20)
         print()
 
     # STEP 7: apply the EL algorithm on the dev dataset
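
Note: the train_el.py hunks above are built around thinc's averaged-parameter pattern: each optimizer keeps a running average of the weights it updates (`sgd.averages`), and `Model.use_params()` temporarily swaps those averages in while predictions are made. A minimal sketch of that pattern, using a hypothetical `encoder` model, `sgd` optimizer and `docs` list rather than the names from this patch:

    # evaluate with the averaged weights, then fall back to the live weights
    with encoder.use_params(sgd.averages):
        averaged_scores = encoder(docs)   # runs the model with averaged parameters
    live_scores = encoder(docs)           # runs the model with the current parameters

One caveat when reading the hunk at line 213: chaining several `use_params()` calls with `and` evaluates the chain down to its last element, so only that final context manager is actually entered; nesting the `with` blocks (or listing them comma-separated in a single `with`) applies the averages to every encoder.
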
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 349b88df9..29772c5ee 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -652,6 +652,28 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
     return model
 
 
+def build_nel_encoder(in_width, hidden_width, end_width, **cfg):
+    conv_depth = cfg.get("conv_depth", 2)
+    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
+
+    with Model.define_operators({">>": chain, "**": clone}):
+        convolution = Residual((ExtractWindow(nW=1) >>
+                                LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
+
+        encoder = SpacyVectors \
+                  >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \
+                  >> flatten_add_lengths \
+                  >> ParametricAttention(hidden_width) \
+                  >> Pooling(mean_pool) \
+                  >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
+                  >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
+
+        # TODO: ReLu or LN(Maxout) ?
+        # sum_pool or mean_pool ?
+
+        encoder.nO = end_width
+        return encoder
+
+
 @layerize
 def flatten(seqs, drop=0.0):
     ops = Model.ops
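
Note: `build_nel_encoder` above returns a thinc model that maps whole `Doc` objects to fixed-width vectors (`encoder.nO == end_width`): the documents' pretrained word vectors are run through a CNN, pooled with parametric attention, and projected by a final `Affine` layer. A rough usage sketch, assuming this patch is applied and a model with pretrained vectors such as `en_core_web_md` is installed (the widths below are illustrative, not mandated by the patch):

    import spacy
    from spacy._ml import build_nel_encoder

    nlp = spacy.load("en_core_web_md")
    encoder = build_nel_encoder(in_width=300, hidden_width=32, end_width=64)

    docs = [nlp("Douglas Adams was an English author."),
            nlp("The hitchhiker travelled across the galaxy.")]
    vectors = encoder(docs)   # one 64-dimensional row per Doc
    assert encoder.nO == 64

The same builder is reused below for the entity, article and sentence encoders of the `EntityLinker`, differing only in their output widths.
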
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 69521c1b2..c8afd431e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -13,6 +13,8 @@ from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
 from thinc.neural.util import to_categorical, copy_array
 
+from contextlib import contextmanager
+import numpy as np
+
+from spacy.cli.pretrain import get_cossim_loss
+
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -23,14 +25,17 @@ from ..vocab cimport Vocab
 from ..syntax import nonproj
 from ..attrs import POS, ID
 from ..parts_of_speech import X
-from .._ml import Tok2Vec, build_tagger_model
+from .._ml import Tok2Vec, build_tagger_model, cosine
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-from .._ml import build_bow_text_classifier
+from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util
 
+# TODO: remove
+from examples.pipeline.wiki_entity_linking import kb_creator
+
 
 def _load_cfg(path):
     if path.exists():
@@ -1065,50 +1070,141 @@ class EntityLinker(Pipe):
     name = 'entity_linker'
 
     @classmethod
-    def Model(cls, nr_class=1, **cfg):
-        # TODO: non-dummy EL implementation
-        return None
+    def Model(cls, **cfg):
+        embed_width = cfg.get("embed_width", 300)
+        hidden_width = cfg.get("hidden_width", 32)
+        entity_width = cfg.get("entity_width", 64)
+        article_width = cfg.get("article_width", 128)
+        sent_width = cfg.get("sent_width", 64)
+
+        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width)
+
+        article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width)
+        sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width)
+
+        # the mention encoder takes the concatenated (article + sentence) encoding
+        # and projects it down to the output dimension of the entity encoder
+        mention_width = article_width + sent_width
+        mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
+
+        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+
+    def __init__(self, **cfg):
+        # TODO: bring-your-own-model
+        self.mention_encoder = True
 
-    def __init__(self, model=True, **cfg):
-        self.model = False
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
+        # TODO: fix this. store entity vectors in the KB ?
+        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
+
+    @contextmanager
+    def use_avg_params(self):
+        """Modify the pipe's encoders/models, to use their average parameter values."""
+        with self.article_encoder.use_params(self.sgd_article.averages), \
+             self.entity_encoder.use_params(self.sgd_entity.averages), \
+             self.sent_encoder.use_params(self.sgd_sent.averages), \
+             self.mention_encoder.use_params(self.sgd_mention.averages):
+            yield
+
+    def require_model(self):
+        """Raise an error if the component's model is not initialized."""
+        if getattr(self, "mention_encoder", None) in (None, True, False):
+            raise ValueError(Errors.E109.format(name=self.name))
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
+        if self.mention_encoder is True:
+            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.sgd_article = create_default_optimizer(self.article_encoder.ops)
+            self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
+            self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
+            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
+
+    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        self.require_model()
+
+        entity_docs, article_docs, sentence_docs = docs
+        assert len(entity_docs) == len(article_docs) == len(sentence_docs)
+
+        if isinstance(entity_docs, Doc):
+            entity_docs = [entity_docs]
+            article_docs = [article_docs]
+            sentence_docs = [sentence_docs]
+
+        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
+        sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
+
+        concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in
+                            range(len(article_docs))]
+        mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop)
+
+        loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+
+        mention_gradient = bp_cont(d_scores, sgd=self.sgd_mention)
+
+        # gradient : concat (doc+sent) vs. desc
+        sent_start = self.article_encoder.nO
+        sent_gradients = list()
+        doc_gradients = list()
+        for x in mention_gradient:
+            doc_gradients.append(list(x[0:sent_start]))
+            sent_gradients.append(list(x[sent_start:]))
+
+        bp_doc(doc_gradients, sgd=self.sgd_article)
+        bp_sent(sent_gradients, sgd=self.sgd_sent)
+
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+        return loss
+
+    def get_loss(self, docs, golds, scores):
+        loss, gradients = get_cossim_loss(scores, golds)
+        return loss, gradients
+
     def __call__(self, doc):
-        self.set_annotations([doc], scores=None, tensors=None)
+        entities, kb_ids = self.predict([doc])
+        self.set_annotations([doc], entities, kb_ids)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
-        """Apply the pipe to a stream of documents.
-        Both __call__ and pipe should delegate to the `predict()`
-        and `set_annotations()` methods.
-        """
         for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
-            self.set_annotations(docs, scores=None, tensors=None)
+            entities, kb_ids = self.predict(docs)
+            self.set_annotations(docs, entities, kb_ids)
             yield from docs
 
-    def set_annotations(self, docs, scores, tensors=None):
-        """
-        Currently implemented as taking the KB entry with highest prior probability for each named entity
-        TODO: actually use context etc
-        """
-        for i, doc in enumerate(docs):
-            for ent in doc.ents:
+    def predict(self, docs):
+        self.require_model()
+        final_entities = list()
+        final_kb_ids = list()
+
+        for i, article_doc in enumerate(docs):
+            doc_encoding = self.article_encoder([article_doc])
+            for ent in article_doc.ents:
+                sent_doc = ent.sent.as_doc()
+                sent_encoding = self.sent_encoder([sent_doc])
+                concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]
+                mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]]))
+                mention_enc_t = np.transpose(mention_encoding)
+
+                candidates = self.kb.get_candidates(ent.text)
                 if candidates:
-                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
-                    for token in ent:
-                        token.ent_kb_id_ = best_candidate.entity_
+                    highest_sim = -5
+                    best_kb_id = None
+                    with self.use_avg_params():
+                        for c in candidates:
+                            kb_id = c.entity_
+                            description = self.id_to_descr.get(kb_id)
+                            entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
+                            sim = cosine(entity_encodings, mention_enc_t)
+                            if sim >= highest_sim:
+                                best_kb_id = kb_id
+                                highest_sim = sim
+
+                    if best_kb_id is not None:
+                        final_entities.append(ent)
+                        final_kb_ids.append(best_kb_id)
+
+                    # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob)
+
+        return final_entities, final_kb_ids
 
-    def get_loss(self, docs, golds, scores):
-        # TODO
-        pass
-
-    def add_label(self, label):
-        # TODO
-        pass
+    def set_annotations(self, docs, entities, kb_ids=None):
+        for entity, kb_id in zip(entities, kb_ids):
+            for token in entity:
+                token.ent_kb_id_ = kb_id
 
 
 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.
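
Note: taken together, the pipes.pyx changes give the `EntityLinker` the usual `Pipe` life cycle (`begin_training()` → `update()` → `__call__`/`pipe()`), except that `update()` expects a tuple of three aligned lists of `Doc` objects rather than plain docs. A rough sketch of a single training step under that contract — `my_kb`, `entity_docs`, `article_docs` and `sentence_docs` are placeholders you would have to build yourself (for example with the wiki_entity_linking scripts), and instantiation currently also reads entity descriptions from a hard-coded local CSV, as flagged by the TODO above, so this is not runnable as-is:

    el = EntityLinker(kb=my_kb)        # cfg must contain the knowledge base
    el.begin_training()                # builds the four encoders and their optimizers

    losses = {}
    docs = (entity_docs, article_docs, sentence_docs)   # aligned 1:1 per mention
    loss = el.update(docs, golds=None, drop=0.1, losses=losses)
    print(losses["entity_linker"])
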