mirror of https://github.com/explosion/spaCy.git

code cleanup

parent d8b435ceff
commit 61f0e2af65
@@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = title_list[0:34200]
     title_to_id = {t: title_to_id[t] for t in title_list}
 
-    # print("title_list", len(title_list), title_list[0:3])
-
     entity_list = [title_to_id[x] for x in title_list]
-    # print("entity_list", len(entity_list), entity_list[0:3])
 
-    # TODO: should we remove entities from the KB where there is no description ?
+    # Currently keeping entities from the KB where there is no description - putting a default void description
     description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
-    # print("description_list", len(description_list), description_list[0:3])
-
 
     print()
     print("2. _get_entity_frequencies", datetime.datetime.now())
@@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     print("4. get entity embeddings", datetime.datetime.now())
     print()
     embeddings = encoder.apply_encoder(description_list)
-    # print("descriptions", description_list[0:3])
-    # print("embeddings", len(embeddings), embeddings[0:3])
-    #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3])
 
     print()
     print("5. adding", len(entity_list), "entities", datetime.datetime.now())
@@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_
         for qid, descr in id_to_descr.items():
             descr_file.write(str(qid) + "|" + descr + "\n")
 
+
 def _get_entity_to_id(entity_def_output):
     entity_to_id = dict()
     with open(entity_def_output, 'r', encoding='utf8') as csvfile:
@@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
         print("wp titles:", wp_titles)
 
     # adding aliases with prior probabilities
-        # we can read this file sequentially, it's sorted by alias, and then by count
+    # we can read this file sequentially, it's sorted by alias, and then by count
     with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
         # skip header
         prior_file.readline()

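The comment in the hunk above explains why a single sequential pass over the prior-probability file is enough: the file is sorted by alias and then by count, so all rows for one alias are adjacent and can be flushed to the KB as soon as the alias changes. A minimal sketch of that grouping pattern follows; the pipe-separated alias|count|entity layout, the header row, and the add_alias callback are illustrative assumptions, not the script's actual file format or API.

# Sketch only: stream a file sorted by (alias, count) and group rows per alias.
# The "alias|count|entity" pipe-separated layout and the add_alias callback are
# assumptions for illustration, not the actual format used by the script.
def read_prior_probs(prior_prob_path, add_alias, max_entities_per_alias, min_occ):
    with open(prior_prob_path, mode="r", encoding="utf8") as prior_file:
        prior_file.readline()  # skip header
        current_alias = None
        counts, entities = [], []
        for line in prior_file:
            alias, count, entity = line.rstrip("\n").split("|")
            count = int(count)
            if current_alias is not None and alias != current_alias:
                # alias changed: every row for the previous alias has been seen
                add_alias(current_alias, entities, counts)
                counts, entities = [], []
            current_alias = alias
            if count >= min_occ and len(entities) < max_entities_per_alias:
                counts.append(count)
                entities.append(entity)
        if current_alias is not None:
            add_alias(current_alias, entities, counts)
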
@@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator
 
 
 def run_kb_toy_example(kb):
-    for mention in ("Bush", "President", "Homer"):
+    for mention in ("Bush", "Douglas Adams", "Homer"):
         candidates = kb.get_candidates(mention)
 
         print("generating candidates for " + mention + " :")
@@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True):
     return precision, recall, fscore, accuracy
 
 
-def _prepare_pipeline(nlp, kb):
-    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
 
 
 # TODO
-def add_coref():
+def add_coref(nlp):
     """ Add coreference resolution to our model """
-    nlp = spacy.load('en_core_web_sm')
-    # nlp = spacy.load('en')
-
     # TODO: this doesn't work yet
     # neuralcoref.add_to_pipe(nlp)
     print("done adding to pipe")

@@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv"
 def create_training(kb, entity_def_input, training_output):
     if not kb:
         raise ValueError("kb should be defined")
-    # nlp = spacy.load('en_core_web_sm')
     wp_to_id = kb_creator._get_entity_to_id(entity_def_input)
     _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000)  # TODO: full dataset
 

@@ -37,11 +37,13 @@ if __name__ == "__main__":
 
     # read KB back in from file
     to_read_kb = True
-    to_test_kb = True
+    to_test_kb = False
 
     # create training dataset
     create_wp_training = False
 
+    train_pipe = True
+
     # run EL training
     run_el_training = False
 
@@ -106,7 +108,15 @@ if __name__ == "__main__":
         print("STEP 5: create training dataset", datetime.datetime.now())
         training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)
 
-    # STEP 6: apply the EL algorithm on the training dataset
+    # STEP 6: create the entity linking pipe
+    if train_pipe:
+        # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        nlp.add_pipe(el_pipe, last=True)
+
+    ### BELOW CODE IS DEPRECATED ###
+
+    # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
     if run_el_training:
         print("STEP 6: training", datetime.datetime.now())
         trainer = EL_Model(kb=my_kb, nlp=nlp)

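The new train_pipe branch above only constructs and attaches the pipe; the encoders themselves are built lazily in EntityLinker.begin_training, shown in the pipes.pyx hunk further down. A rough sketch of how a training step could then be driven, based only on the begin_training and update signatures visible in this commit; training_batches and its unpacking are hypothetical placeholders, not the script's actual training loop.

# Hedged sketch: exercises only the begin_training/update signatures from the
# pipes.pyx hunk below; the batching iterable is a made-up placeholder.
if train_pipe:
    el_pipe.begin_training()  # builds the article/sentence/mention encoders and their optimizers
    for entity_docs, article_docs, sentence_docs, golds in training_batches:  # hypothetical iterable
        el_pipe.update(docs=(entity_docs, article_docs, sentence_docs), golds=golds, drop=0.1)
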
@@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser):
 
 
 class EntityLinker(Pipe):
+    """Pipeline component for named entity linking.
+
+    DOCS: TODO
+    """
     name = 'entity_linker'
 
     @classmethod
     def Model(cls, **cfg):
         embed_width = cfg.get("embed_width", 300)
         hidden_width = cfg.get("hidden_width", 32)
-        entity_width = cfg.get("entity_width", 64)
         article_width = cfg.get("article_width", 128)
         sent_width = cfg.get("sent_width", 64)
-
-        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
+        entity_width = cfg["kb"].entity_vector_length
 
         article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
         sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
 
         # dimension of the mention encoder needs to match the dimension of the entity encoder
-        mention_width = entity_encoder.nO
+        mention_width = article_width + sent_width
         mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
 
-        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+        return article_encoder, sent_encoder, mention_encoder
 
     def __init__(self, **cfg):
-        # TODO: bring-your-own-model
         self.mention_encoder = True
-
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
 
-        # TODO: fix this. store entity vectors in the KB ?
-        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
-
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
         with self.article_encoder.use_params(self.sgd_article.averages) \
-                 and self.entity_encoder.use_params(self.sgd_entity.averages)\
                  and self.sent_encoder.use_params(self.sgd_sent.averages) \
                  and self.mention_encoder.use_params(self.sgd_mention.averages):
             yield

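The dimension bookkeeping in Model() is the heart of the hunk above: the separate entity encoder is dropped, entity vectors now come from the KB itself (entity_width = cfg["kb"].entity_vector_length), and the mention representation is the concatenation of the article and sentence encodings, which is why mention_width = article_width + sent_width. The Affine layer then projects that concatenation into the entity-vector space so mention and entity can be compared directly. Below is a small numpy sketch of the shape flow using the default widths from the hunk; the random arrays stand in for real encoder output, and 64 is only a placeholder for kb.entity_vector_length.

import numpy as np

article_width, sent_width = 128, 64      # defaults from Model()
entity_width = 64                        # placeholder for cfg["kb"].entity_vector_length

doc_encoding = np.random.rand(article_width)   # stand-in for article_encoder output
sent_encoding = np.random.rand(sent_width)     # stand-in for sent_encoder output

# the mention representation concatenates both encodings
mention = np.concatenate([doc_encoding, sent_encoding])   # shape (192,) == article_width + sent_width

# Affine(entity_width, mention_width) amounts to a projection into entity-vector space
W = np.random.rand(entity_width, mention.shape[0])
b = np.zeros(entity_width)
mention_in_entity_space = W @ mention + b                  # shape (entity_width,), comparable to KB vectors
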
@@ -1113,14 +1109,13 @@ class EntityLinker(Pipe):
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
         if self.mention_encoder is True:
-            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
             self.sgd_article = create_default_optimizer(self.article_encoder.ops)
             self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
             self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
-            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
 
     def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
-        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """
         self.require_model()
 
         entity_docs, article_docs, sentence_docs = docs
@@ -1131,7 +1126,7 @@ class EntityLinker(Pipe):
             article_docs = [article_docs]
             sentence_docs = [sentence_docs]
 
-        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        entity_encodings = None #TODO
         doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
         sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
 
@@ -1195,10 +1190,9 @@ class EntityLinker(Pipe):
                         for c in candidates:
                             prior_prob = c.prior_prob
                             kb_id = c.entity_
-                            description = self.id_to_descr.get(kb_id)
-                            entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
-                            sim = cosine(entity_encodings, mention_enc_t)
-                            score = prior_prob + sim - (prior_prob*sim)  # TODO: weights ?
+                            entity_encoding = c.entity_vector
+                            sim = cosine([entity_encoding], mention_enc_t)
+                            score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                             scores.append(score)
 
                         best_index = scores.index(max(scores))

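The rewritten candidate loop above scores each candidate as score = prior_prob + sim - prior_prob*sim, where sim is the cosine similarity between the candidate's stored KB vector (c.entity_vector) and the projected mention encoding. For values in [0, 1] this is a probabilistic-OR style combination, 1 - (1 - prior)(1 - sim): the score equals the prior when sim is 0 and climbs toward 1 as either factor grows; cosine can in principle be negative, which the formula does not guard against, and the trailing comment notes that explicit weights on the two factors may still be wanted. A self-contained illustration with made-up vectors and a cosine helper written out in numpy (not the module's own cosine):

import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

entity_vector = np.array([0.2, 0.9, 0.1])    # made-up KB vector for one candidate
mention_vector = np.array([0.3, 0.8, 0.0])   # made-up projected mention encoding
prior_prob = 0.7                             # made-up alias prior

sim = cosine(entity_vector, mention_vector)
score = prior_prob + sim - (prior_prob * sim)   # e.g. prior 0.7, sim 0.5 -> 0.7 + 0.5 - 0.35 = 0.85
print(score)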