From 83dc7b46fd1b39023c6eb883471c961d9e5bd51c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 10 Jun 2019 21:25:26 +0200
Subject: [PATCH] first tests with EL pipe

---
 .../wiki_entity_linking/kb_creator.py         |  4 +--
 .../wiki_entity_linking/train_descriptions.py |  4 +--
 .../training_set_creator.py                   |  4 +--
 .../wiki_entity_linking/wiki_nel_pipeline.py  | 19 +++++-----
 spacy/pipeline/pipes.pyx                      | 36 ++++++++++++++-----
 5 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index ee632bd48..e7e3d077d 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
 
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:34200]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:34200]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
 
     entity_list = [title_to_id[x] for x in title_list]
 
diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py
index f2c3fa05d..e1a2f1797 100644
--- a/examples/pipeline/wiki_entity_linking/train_descriptions.py
+++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py
@@ -17,7 +17,7 @@ class EntityEncoder:
 
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.9  # 0.1
+    STOP_THRESHOLD = 0.1
 
     BATCH_SIZE = 1000
 
@@ -32,7 +32,7 @@ class EntityEncoder:
 
         print("Encoding", len(description_list), "entities")
 
-        batch_size = 10000
+        batch_size = 100000
 
         start = 0
         stop = min(batch_size, len(description_list))
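
For context on the `start`/`stop` bookkeeping that the hunk above sets up: the encoder walks the description list in fixed-size slices. A minimal, self-contained sketch of that chunking pattern, assuming the caller supplies the actual encoding function (`encode_in_chunks` and `encode_chunk` are illustrative names, not helpers from this file):

```python
def encode_in_chunks(description_list, encode_chunk, batch_size=100000):
    # Walk the list in fixed-size slices so a very large description set
    # never has to be encoded in one go.
    encodings = []
    start = 0
    stop = min(batch_size, len(description_list))
    while start < len(description_list):
        encodings.extend(encode_chunk(description_list[start:stop]))
        start = stop
        stop = min(stop + batch_size, len(description_list))
    return encodings


# Dummy usage: "encode" each description by its length, two items per chunk.
print(encode_in_chunks(["a", "bb", "ccc"], lambda chunk: [len(d) for d in chunk], batch_size=2))
```
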
diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py
index 156bce05f..38a86058d 100644
--- a/examples/pipeline/wiki_entity_linking/training_set_creator.py
+++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py
@@ -298,7 +298,7 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr
     return correct_entries_per_article, incorrect_entries_per_article
 
 
-def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print):
+def read_training(nlp, training_dir, dev, limit, to_print):
     correct_entries, incorrect_entries = read_training_entities(training_output=training_dir,
                                                                 collect_correct=True,
                                                                 collect_incorrect=True)
@@ -306,7 +306,6 @@ def read_training(nlp, training_dir, dev, limit, to_pri
     data = []
 
     cnt = 0
-    next_entity_nr = 1
     files = listdir(training_dir)
     for f in files:
         if not limit or cnt < limit:
@@ -320,7 +319,6 @@ def read_training(nlp, training_dir, dev, limit, to_pri
             with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                 text = file.read()
                 article_doc = nlp(text)
-                truncated_text = text[0:min(doc_cutoff, len(text))]
 
                 gold_entities = list()
 
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index ded4bdc24..4be1ae2fb 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -121,15 +121,16 @@ if __name__ == "__main__":
     if train_pipe:
         id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR)
 
-        train_data = training_set_creator.read_training(nlp=nlp,
-                                                        training_dir=TRAINING_DIR,
-                                                        id_to_descr=id_to_descr,
-                                                        doc_cutoff=DOC_CHAR_CUTOFF,
-                                                        dev=False,
-                                                        limit=100,
-                                                        to_print=False)
+        train_limit = 10
+        print("Training on", train_limit, "articles")
 
-        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        train_data = training_set_creator.read_training(nlp=nlp,
+                                                        training_dir=TRAINING_DIR,
+                                                        dev=False,
+                                                        limit=train_limit,
+                                                        to_print=False)
+
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})
         nlp.add_pipe(el_pipe, last=True)
 
         other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -141,7 +142,7 @@ if __name__ == "__main__":
             print("EPOCH", itn)
             random.shuffle(train_data)
             losses = {}
-            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 nlp.update(
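
The training loop above grows its batch size on a compounding schedule rather than keeping it fixed. A small illustration of how `compounding(4.0, 128.0, 1.001)` drives `minibatch` (both helpers live in `spacy.util` in spaCy 2.x):

```python
from spacy.util import minibatch, compounding

# compounding yields 4.0, 4.004, 4.008, ..., multiplying by 1.001 on each
# draw and capping at 128.0; minibatch truncates each value to an int size.
sizes = compounding(4.0, 128.0, 1.001)
for batch in minibatch(range(20), size=sizes):
    print(len(batch), list(batch))
```

At a factor of 1.001 the size creeps from 4 up to the 128 cap only after roughly 3,500 batches, so early updates effectively use small batches and later ones much larger ones.
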
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 01302b618..e5ed2ec23 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -11,9 +11,8 @@ from collections import OrderedDict
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical, copy_array
-
-from spacy.cli.pretrain import get_cossim_loss
+from thinc.neural.util import to_categorical
+from thinc.neural.util import get_array_module
 
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
@@ -33,9 +32,6 @@ from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util
 
-# TODO: remove
-from examples.pipeline.wiki_entity_linking import kb_creator
-
 
 def _load_cfg(path):
     if path.exists():
@@ -1094,6 +1090,7 @@ class EntityLinker(Pipe):
         self.mention_encoder = True
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
+        self.doc_cutoff = self.cfg["doc_cutoff"]
 
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
@@ -1134,6 +1131,7 @@ class EntityLinker(Pipe):
                 start, end, gold_kb = entity
                 mention = doc[start:end]
                 sentence = mention.sent
+                first_par = doc[0:self.doc_cutoff].as_doc()
 
                 candidates = self.kb.get_candidates(mention.text)
                 for c in candidates:
@@ -1144,7 +1142,7 @@ class EntityLinker(Pipe):
                         entity_encoding = c.entity_vector
                         entity_encodings.append(entity_encoding)
 
-                        article_docs.append(doc)
+                        article_docs.append(first_par)
                         sentence_docs.append(sentence.as_doc())
 
         if len(entity_encodings) > 0:
@@ -1158,6 +1156,10 @@ class EntityLinker(Pipe):
             entity_encodings = np.asarray(entity_encodings, dtype=np.float32)
 
             loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+            # print("scores", mention_encodings)
+            # print("golds", entity_encodings)
+            # print("loss", loss)
+            # print("d_scores", d_scores)
 
             mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention)
 
@@ -1180,9 +1182,26 @@ class EntityLinker(Pipe):
         return 0
 
     def get_loss(self, docs, golds, scores):
-        loss, gradients = get_cossim_loss(scores, golds)
+        targets = [[1] for _ in golds]  # assuming we're only using positive examples
+        loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets)
+        # loss = loss / len(golds)
         return loss, gradients
 
+    def get_cossim_loss_2(self, yh, y, t):
+        # Add a small constant to avoid 0 vectors
+        yh = yh + 1e-8
+        y = y + 1e-8
+        # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+        xp = get_array_module(yh)
+        norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+        norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+        mul_norms = norm_yh * norm_y
+        cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+        d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))
+        loss = xp.abs(cos - t).sum()
+        inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))])
+        return loss, -inverse
+
     def __call__(self, doc):
         entities, kb_ids = self.predict([doc])
         self.set_annotations([doc], entities, kb_ids)
@@ -1220,6 +1239,7 @@ class EntityLinker(Pipe):
                     score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                     scores.append(score)
 
+                # TODO: thresholding
                 best_index = scores.index(max(scores))
                 best_candidate = candidates[best_index]
                 final_entities.append(ent)
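
The hand-rolled `get_cossim_loss_2` above implements the cosine-similarity derivative linked in its comment. A standalone NumPy sketch of the same computation for the positive-target case (`t = 1`, as `get_loss` assumes), with a finite-difference check that the analytic gradient matches the loss:

```python
import numpy as np

def cossim_loss(yh, y):
    # loss = sum_i (1 - cos(yh_i, y_i)); returns loss and d(loss)/d(yh).
    yh = yh + 1e-8  # avoid zero vectors
    y = y + 1e-8
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))  # d(cos)/d(yh)
    return np.abs(cos - 1.0).sum(), -d_yh  # sign flips: loss falls as cos rises

rng = np.random.RandomState(0)
yh = rng.rand(3, 5)
y = rng.rand(3, 5)
loss, grad = cossim_loss(yh, y)

# Finite-difference check on a single coordinate.
eps = 1e-6
yh_eps = yh.copy()
yh_eps[0, 0] += eps
loss_eps, _ = cossim_loss(yh_eps, y)
print(grad[0, 0], (loss_eps - loss) / eps)  # the two numbers should agree closely
```

Separately, the candidate score `prior_prob + sim - (prior_prob*sim)` in `predict` is a probabilistic OR: it equals `1 - (1 - prior_prob) * (1 - sim)`, so when both inputs lie in [0, 1] the combined score stays in [0, 1], and either a strong prior or a strong similarity alone can lift it.
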