From 68a0662019760a20bbc740be43b2ec58aa5a816e Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 28 Jun 2019 08:29:31 +0200
Subject: [PATCH] context encoder with Tok2Vec + linking model instead of
 cosine

---
 bin/wiki_entity_linking/kb_creator.py         |  4 +-
 bin/wiki_entity_linking/train_descriptions.py |  4 +-
 .../training_set_creator.py                   |  3 +-
 examples/pipeline/wikidata_entity_linking.py  |  9 +--
 spacy/_ml.py                                  | 45 +++++++------
 spacy/pipeline/pipes.pyx                      | 66 ++++++++++++-------
 6 files changed, 73 insertions(+), 58 deletions(-)

diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py
index 6ee139174..e8e081cef 100644
--- a/bin/wiki_entity_linking/kb_creator.py
+++ b/bin/wiki_entity_linking/kb_creator.py
@@ -33,7 +33,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
     else:
         # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
-        id_to_descr = _get_id_to_description(entity_descr_output)
+        id_to_descr = get_id_to_description(entity_descr_output)
 
     print()
     print(" * _get_entity_frequencies", datetime.datetime.now())
@@ -109,7 +109,7 @@ def get_entity_to_id(entity_def_output):
     return entity_to_id
 
 
-def _get_id_to_description(entity_descr_output):
+def get_id_to_description(entity_descr_output):
     id_to_desc = dict()
     with open(entity_descr_output, 'r', encoding='utf8') as csvfile:
         csvreader = csv.reader(csvfile, delimiter='|')
diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
index 948a0e2d1..6a4d046e5 100644
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@@ -14,7 +14,7 @@ from thinc.neural._classes.affine import Affine
 class EntityEncoder:
     """
     Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
-    This entity vector will be stored in the KB, and context vectors will be trained to be similar to them.
+    This entity vector will be stored in the KB, for further downstream use in the entity model.
     """
 
     DROP = 0
@@ -97,7 +97,7 @@ class EntityEncoder:
             else:
                 indices[i] = 0
         word_vectors = doc.vocab.vectors.data[indices]
-        doc_vector = np.mean(word_vectors, axis=0)  # TODO: min? max?
+        doc_vector = np.mean(word_vectors, axis=0)
         return doc_vector
 
     def _build_network(self, orig_width, hidden_with):
diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py
index 51105ce09..436154409 100644
--- a/bin/wiki_entity_linking/training_set_creator.py
+++ b/bin/wiki_entity_linking/training_set_creator.py
@@ -14,8 +14,7 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm
 Gold-standard entities are stored in one file in standoff format (by character offset).
""" -# ENTITY_FILE = "gold_entities.csv" -ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing +ENTITY_FILE = "gold_entities.csv" def create_training(wikipedia_input, entity_def_input, training_output): diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 9ce3b9559..600436a1d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -42,9 +42,10 @@ MIN_PAIR_OCC = 5 # model training parameters EPOCHS = 10 -DROPOUT = 0.1 +DROPOUT = 0.2 LEARN_RATE = 0.005 L2 = 1e-6 +CONTEXT_WIDTH=128 def run_pipeline(): @@ -136,7 +137,8 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: - el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -146,9 +148,8 @@ def run_pipeline(): optimizer.learn_rate = LEARN_RATE optimizer.L2 = L2 - print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set - train_limit = 5000 + train_limit = 500000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, diff --git a/spacy/_ml.py b/spacy/_ml.py index 82db0fc05..b00ceda62 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,37 +652,36 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model -def build_nel_encoder(in_width, hidden_width, end_width, **cfg): +def build_nel_encoder(embed_width, hidden_width, **cfg): + # TODO proper error + if "entity_width" not in cfg: + raise ValueError("entity_width not found") + if "context_width" not in cfg: + raise ValueError("context_width not found") + conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name - - tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0) + context_width = cfg.get("context_width") + entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - # convolution = Residual((ExtractWindow(nW=1) >> - # LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + model = Affine(1, entity_width+context_width+1, drop_factor=0.0)\ + >> logistic - # encoder = SpacyVectors \ - # >> with_flatten(Affine(hidden_width, in_width)) \ - # >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ - # >> flatten_add_lengths \ - # >> ParametricAttention(hidden_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - # >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + # context encoder + tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, + bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) - encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\ - >> 
-                  >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
-                  >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
+        model.tok2vec = tok2vec
 
-        # TODO: ReLu or LN(Maxout) ?
-        # sum_pool or mean_pool ?
-
-        encoder.tok2vec = tok2vec
-        encoder.nO = end_width
-        return encoder
+    model.tok2vec = tok2vec
+    model.tok2vec.nO = context_width
+    model.nO = 1
+    return model
 
 
 @layerize
 def flatten(seqs, drop=0.0):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 33b3baf8d..25df31f70 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
 
 import numpy
 import srsly
+import random
 from collections import OrderedDict
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
@@ -229,7 +230,7 @@ class Tensorizer(Pipe):
 
         vocab (Vocab): A `Vocab` instance. The model must share the same
             `Vocab` instance with the `Doc` objects it will process.
-        model (Model): A `Model` instance or `True` allocate one later.
+        model (Model): A `Model` instance or `True` to allocate one later.
         **cfg: Config parameters.
 
         EXAMPLE:
@@ -386,7 +387,7 @@ class Tagger(Pipe):
     def predict(self, docs):
         self.require_model()
         if not any(len(doc) for doc in docs):
-            # Handle case where there are no tokens in any docs.
+            # Handle cases where there are no tokens in any docs.
             n_labels = len(self.labels)
             guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
             tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
@@ -1071,22 +1072,20 @@ class EntityLinker(Pipe):
 
     @classmethod
     def Model(cls, **cfg):
-        if "entity_width" not in cfg:
-            raise ValueError("entity_width not found")
-
         embed_width = cfg.get("embed_width", 300)
         hidden_width = cfg.get("hidden_width", 128)
-        entity_width = cfg.get("entity_width")  # this needs to correspond with the KB entity length
 
-        model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg)
+        model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg)
         return model
 
     def __init__(self, **cfg):
         self.model = True
         self.kb = None
+        self.sgd_context = None
         self.cfg = dict(cfg)
         self.context_weight = cfg.get("context_weight", 1)
         self.prior_weight = cfg.get("prior_weight", 1)
+        self.context_width = cfg.get("context_width")
 
     def set_kb(self, kb):
         self.kb = kb
@@ -1107,6 +1106,7 @@ class EntityLinker(Pipe):
 
         if self.model is True:
             self.model = self.Model(**self.cfg)
+            self.sgd_context = self.create_optimizer()
 
         if sgd is None:
             sgd = self.create_optimizer()
@@ -1132,35 +1132,55 @@ class EntityLinker(Pipe):
 
         context_docs = []
         entity_encodings = []
+        labels = []
 
         for doc, gold in zip(docs, golds):
             for entity in gold.links:
                 start, end, gold_kb = entity
                 mention = doc.text[start:end]
-
                 candidates = self.kb.get_candidates(mention)
+                random.shuffle(candidates)
+                nr_neg = 0
                 for c in candidates:
                     kb_id = c.entity_
-                    # Currently only training on the positive instances
                     if kb_id == gold_kb:
-                        prior_prob = c.prior_prob
                         entity_encoding = c.entity_vector
                         entity_encodings.append(entity_encoding)
                         context_docs.append(doc)
+                        labels.append([1])
+                    else:  # elif nr_neg < 1:
+                        nr_neg += 1
+                        entity_encoding = c.entity_vector
+                        entity_encodings.append(entity_encoding)
+                        context_docs.append(doc)
+                        labels.append([0])
 
         if len(entity_encodings) > 0:
-            context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
+            context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop)
             entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
 
-            loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None)
-            bp_context(d_scores, sgd=sgd)
+            mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
+            pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop)
+            labels = self.model.ops.asarray(labels, dtype="float32")
+
+            loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None)
+            mention_gradient = bp_mention(d_scores, sgd=sgd)
+
+            context_gradients = [list(x[0:self.context_width]) for x in mention_gradient]
+            bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context)
 
             if losses is not None:
                 losses[self.name] += loss
             return loss
         return 0
 
-    def get_loss(self, docs, golds, scores):
+    def get_loss(self, docs, golds, prediction):
+        d_scores = (prediction - golds)
+        loss = (d_scores ** 2).sum()
+        loss = loss / len(golds)
+        return loss, d_scores
+
+    def get_loss_old(self, docs, golds, scores):
         # this loss function assumes we're only using positive examples
         loss, gradients = get_cossim_loss(yh=scores, y=golds)
         loss = loss / len(golds)
@@ -1191,30 +1211,26 @@ class EntityLinker(Pipe):
         if isinstance(docs, Doc):
             docs = [docs]
 
-        context_encodings = self.model(docs)
+        context_encodings = self.model.tok2vec(docs)
         xp = get_array_module(context_encodings)
 
         for i, doc in enumerate(docs):
             if len(doc) > 0:
                 context_encoding = context_encodings[i]
-                context_enc_t = context_encoding.T
-                norm_1 = xp.linalg.norm(context_enc_t)
                 for ent in doc.ents:
                     candidates = self.kb.get_candidates(ent.text)
                     if candidates:
-                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
+                        random.shuffle(candidates)
+                        prior_probs = xp.asarray([[c.prior_prob] for c in candidates])
                         prior_probs *= self.prior_weight
                         entity_encodings = xp.asarray([c.entity_vector for c in candidates])
-                        norm_2 = xp.linalg.norm(entity_encodings, axis=1)
-
-                        # cosine similarity
-                        sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2)
-                        sims *= self.context_weight
-                        scores = prior_probs + sims - (prior_probs*sims)
-                        best_index = scores.argmax()
+                        mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
+                        predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
+                        scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions)))  # TODO: thresholding
 
+                        best_index = scores.argmax()
                         best_candidate = candidates[best_index]
                         final_entities.append(ent)
                         final_kb_ids.append(best_candidate.entity_)
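
The scoring change in predict() above replaces the old cosine-similarity ranking: each candidate is now scored by a small learned model over the concatenated [context encoding, entity vector] input, and that prediction is combined with the candidate's prior probability roughly as prior + pred - prior * pred, i.e. a probabilistic OR of the two signals. Below is a minimal NumPy sketch of just that combination step; the values and the variable names (prior_probs, predictions) are made up for illustration and are not part of the patch or of spaCy's API.

    import numpy as np

    # One row per candidate: the KB prior probability and the probability
    # predicted by the context + entity-vector model (hypothetical numbers).
    prior_probs = np.array([[0.6], [0.3], [0.1]])
    predictions = np.array([[0.2], [0.9], [0.5]])

    # Probabilistic OR: a candidate scores high if either signal is high.
    scores = prior_probs + predictions - prior_probs * predictions

    best_index = int(scores.argmax())
    print(best_index)  # -> 1: the strong context prediction outweighs the weaker prior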