From 61f0e2af654ae6202a9b283794021c84d458fd5b Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 6 Jun 2019 20:22:14 +0200
Subject: [PATCH] code cleanup

---
 .../wiki_entity_linking/kb_creator.py         | 13 ++------
 .../pipeline/wiki_entity_linking/run_el.py    | 12 ++-----
 .../training_set_creator.py                   |  1 -
 .../wiki_entity_linking/wiki_nel_pipeline.py  | 14 ++++++--
 spacy/pipeline/pipes.pyx                      | 32 ++++++++-----------
 5 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index 74e8efabd..ee632bd48 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
 
     title_list = title_list[0:34200]
     title_to_id = {t: title_to_id[t] for t in title_list}
 
-    # print("title_list", len(title_list), title_list[0:3])
-
     entity_list = [title_to_id[x] for x in title_list]
-    # print("entity_list", len(entity_list), entity_list[0:3])
 
-    # TODO: should we remove entities from the KB where there is no description ?
+    # Currently keeping entities from the KB where there is no description - putting a default void description
     description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
-    # print("description_list", len(description_list), description_list[0:3])
-
     print()
     print("2. _get_entity_frequencies", datetime.datetime.now())
@@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     print("4. get entity embeddings", datetime.datetime.now())
     print()
     embeddings = encoder.apply_encoder(description_list)
-    # print("descriptions", description_list[0:3])
-    # print("embeddings", len(embeddings), embeddings[0:3])
-    #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3])
 
     print()
adding", len(entity_list), "entities", datetime.datetime.now()) @@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") + def _get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: @@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in print("wp titles:", wp_titles) # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count + # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index f6797587e..c3074ab5c 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator def run_kb_toy_example(kb): - for mention in ("Bush", "President", "Homer"): + for mention in ("Bush", "Douglas Adams", "Homer"): candidates = kb.get_candidates(mention) print("generating candidates for " + mention + " :") @@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True): return precision, recall, fscore, accuracy -def _prepare_pipeline(nlp, kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) + # TODO -def add_coref(): +def add_coref(nlp): """ Add coreference resolution to our model """ - nlp = spacy.load('en_core_web_sm') - # nlp = spacy.load('en') - # TODO: this doesn't work yet # neuralcoref.add_to_pipe(nlp) print("done adding to pipe") diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index b1c63c55c..ac8ad0744 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv" def create_training(kb, entity_def_input, training_output): if not kb: raise ValueError("kb should be defined") - # nlp = spacy.load('en_core_web_sm') wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a669634f9..390a6800b 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -37,11 +37,13 @@ if __name__ == "__main__": # read KB back in from file to_read_kb = True - to_test_kb = True + to_test_kb = False # create training dataset create_wp_training = False + train_pipe = True + # run EL training run_el_training = False @@ -106,7 +108,15 @@ if __name__ == "__main__": print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: apply the EL algorithm on the training dataset + # STEP 6: create the entity 
+    # STEP 6: create the entity linking pipe
+    if train_pipe:
+        # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        nlp.add_pipe(el_pipe, last=True)
+
+    ### BELOW CODE IS DEPRECATED ###
+
+    # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
     if run_el_training:
         print("STEP 6: training", datetime.datetime.now())
         trainer = EL_Model(kb=my_kb, nlp=nlp)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index d9fbe59ff..c5187a593 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser):
 
 
 class EntityLinker(Pipe):
+    """Pipeline component for named entity linking.
+
+    DOCS: TODO
+    """
     name = 'entity_linker'
 
     @classmethod
     def Model(cls, **cfg):
         embed_width = cfg.get("embed_width", 300)
         hidden_width = cfg.get("hidden_width", 32)
-        entity_width = cfg.get("entity_width", 64)
         article_width = cfg.get("article_width", 128)
         sent_width = cfg.get("sent_width", 64)
-
-        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
+        entity_width = cfg["kb"].entity_vector_length
 
         article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
         sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
 
         # dimension of the mention encoder needs to match the dimension of the entity encoder
-        mention_width = entity_encoder.nO
+        mention_width = article_width + sent_width
         mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
 
-        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+        return article_encoder, sent_encoder, mention_encoder
 
     def __init__(self, **cfg):
-        # TODO: bring-your-own-model
         self.mention_encoder = True
-
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
 
-        # TODO: fix this. store entity vectors in the KB ?
-        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
-
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
         with self.article_encoder.use_params(self.sgd_article.averages) \
-        and self.entity_encoder.use_params(self.sgd_entity.averages)\
         and self.sent_encoder.use_params(self.sgd_sent.averages) \
         and self.mention_encoder.use_params(self.sgd_mention.averages):
             yield
@@ -1113,14 +1109,13 @@ class EntityLinker(Pipe):
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
         if self.mention_encoder is True:
-            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
             self.sgd_article = create_default_optimizer(self.article_encoder.ops)
             self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
             self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
-            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
 
     def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
-        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """
         self.require_model()
 
         entity_docs, article_docs, sentence_docs = docs
@@ -1131,7 +1126,7 @@ class EntityLinker(Pipe):
             article_docs = [article_docs]
             sentence_docs = [sentence_docs]
 
-        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        entity_encodings = None #TODO
         doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
         sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
 
@@ -1195,10 +1190,9 @@ class EntityLinker(Pipe):
                 for c in candidates:
                     prior_prob = c.prior_prob
                     kb_id = c.entity_
-                    description = self.id_to_descr.get(kb_id)
-                    entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
-                    sim = cosine(entity_encodings, mention_enc_t)
-                    score = prior_prob + sim - (prior_prob*sim)  # TODO: weights ?
+                    entity_encoding = c.entity_vector
+                    sim = cosine([entity_encoding], mention_enc_t)
+                    score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                     scores.append(score)
 
                 best_index = scores.index(max(scores))
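
The new candidate scoring in EntityLinker.predict combines two signals per candidate: the alias prior probability from the KB, and the cosine similarity between the mention encoding and the entity vector that is now stored directly in the KB (c.entity_vector), merged as prior_prob + sim - prior_prob*sim (a probabilistic OR, assuming both values lie in [0, 1]). Below is a minimal standalone sketch of that scoring step using plain numpy rather than spaCy's internals; the candidate layout and helper names are illustrative, not the library API.

import numpy as np

def cosine(u, v):
    # cosine similarity between two 1-D vectors
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))

def score_candidates(mention_vector, candidates):
    # candidates: list of (kb_id, prior_prob, entity_vector) tuples (illustrative layout)
    best_kb_id, best_score = None, -1.0
    for kb_id, prior_prob, entity_vector in candidates:
        sim = cosine(entity_vector, mention_vector)
        # probabilistic OR of prior and context similarity; how to weight the two factors
        # is left open in the patch as well
        score = prior_prob + sim - (prior_prob * sim)
        if score > best_score:
            best_kb_id, best_score = kb_id, score
    return best_kb_id, best_score

# toy usage with made-up vectors and priors
mention = [0.1, 0.9, 0.0]
cands = [("Q42", 0.7, [0.1, 0.8, 0.1]), ("Q185", 0.2, [0.9, 0.0, 0.1])]
print(score_candidates(mention, cands))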