From 61f0e2af654ae6202a9b283794021c84d458fd5b Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 6 Jun 2019 20:22:14 +0200
Subject: [PATCH] code cleanup

---
 .../wiki_entity_linking/kb_creator.py         | 13 ++------
 .../pipeline/wiki_entity_linking/run_el.py    | 12 ++-----
 .../training_set_creator.py                   |  1 -
 .../wiki_entity_linking/wiki_nel_pipeline.py  | 14 ++++++--
 spacy/pipeline/pipes.pyx                      | 32 ++++++++-----------
 5 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index 74e8efabd..ee632bd48 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
 
     title_list = title_list[0:34200]
     title_to_id = {t: title_to_id[t] for t in title_list}
 
-    # print("title_list", len(title_list), title_list[0:3])
-
     entity_list = [title_to_id[x] for x in title_list]
-    # print("entity_list", len(entity_list), entity_list[0:3])
 
-    # TODO: should we remove entities from the KB where there is no description ?
+    # Currently keeping entities from the KB where there is no description - putting a default void description
     description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
-    # print("description_list", len(description_list), description_list[0:3])
-
     print()
     print("2. _get_entity_frequencies", datetime.datetime.now())
@@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     print("4. get entity embeddings", datetime.datetime.now())
     print()
     embeddings = encoder.apply_encoder(description_list)
-    # print("descriptions", description_list[0:3])
-    # print("embeddings", len(embeddings), embeddings[0:3])
-    #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3])
 
     print()
adding", len(entity_list), "entities", datetime.datetime.now()) @@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") + def _get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: @@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in print("wp titles:", wp_titles) # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count + # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index f6797587e..c3074ab5c 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator def run_kb_toy_example(kb): - for mention in ("Bush", "President", "Homer"): + for mention in ("Bush", "Douglas Adams", "Homer"): candidates = kb.get_candidates(mention) print("generating candidates for " + mention + " :") @@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True): return precision, recall, fscore, accuracy -def _prepare_pipeline(nlp, kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) + # TODO -def add_coref(): +def add_coref(nlp): """ Add coreference resolution to our model """ - nlp = spacy.load('en_core_web_sm') - # nlp = spacy.load('en') - # TODO: this doesn't work yet # neuralcoref.add_to_pipe(nlp) print("done adding to pipe") diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index b1c63c55c..ac8ad0744 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv" def create_training(kb, entity_def_input, training_output): if not kb: raise ValueError("kb should be defined") - # nlp = spacy.load('en_core_web_sm') wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a669634f9..390a6800b 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -37,11 +37,13 @@ if __name__ == "__main__": # read KB back in from file to_read_kb = True - to_test_kb = True + to_test_kb = False # create training dataset create_wp_training = False + train_pipe = True + # run EL training run_el_training = False @@ -106,7 +108,15 @@ if __name__ == "__main__": print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: apply the EL algorithm on the training dataset + # STEP 6: create the entity 
+    # STEP 6: create the entity linking pipe
+    if train_pipe:
+        # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        nlp.add_pipe(el_pipe, last=True)
+
+    ### BELOW CODE IS DEPRECATED ###
+
+    # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
     if run_el_training:
         print("STEP 6: training", datetime.datetime.now())
         trainer = EL_Model(kb=my_kb, nlp=nlp)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index d9fbe59ff..c5187a593 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser):
 
 
 class EntityLinker(Pipe):
+    """Pipeline component for named entity linking.
+
+    DOCS: TODO
+    """
     name = 'entity_linker'
 
     @classmethod
     def Model(cls, **cfg):
         embed_width = cfg.get("embed_width", 300)
         hidden_width = cfg.get("hidden_width", 32)
-        entity_width = cfg.get("entity_width", 64)
         article_width = cfg.get("article_width", 128)
         sent_width = cfg.get("sent_width", 64)
-
-        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
+        entity_width = cfg["kb"].entity_vector_length
 
         article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
         sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
 
         # dimension of the mention encoder needs to match the dimension of the entity encoder
-        mention_width = entity_encoder.nO
+        mention_width = article_width + sent_width
         mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
 
-        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+        return article_encoder, sent_encoder, mention_encoder
 
     def __init__(self, **cfg):
-        # TODO: bring-your-own-model
         self.mention_encoder = True
-
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
 
-        # TODO: fix this. store entity vectors in the KB ?
-        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
-
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
         with self.article_encoder.use_params(self.sgd_article.averages) \
-        and self.entity_encoder.use_params(self.sgd_entity.averages)\
         and self.sent_encoder.use_params(self.sgd_sent.averages) \
         and self.mention_encoder.use_params(self.sgd_mention.averages):
             yield
@@ -1113,14 +1109,13 @@ class EntityLinker(Pipe):
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
         if self.mention_encoder is True:
-            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
             self.sgd_article = create_default_optimizer(self.article_encoder.ops)
             self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
             self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
-            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
 
     def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
-        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """
         self.require_model()
 
         entity_docs, article_docs, sentence_docs = docs
@@ -1131,7 +1126,7 @@ class EntityLinker(Pipe):
             article_docs = [article_docs]
             sentence_docs = [sentence_docs]
 
-        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        entity_encodings = None #TODO
         doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
         sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
 
@@ -1195,10 +1190,9 @@ class EntityLinker(Pipe):
                 for c in candidates:
                     prior_prob = c.prior_prob
                     kb_id = c.entity_
-                    description = self.id_to_descr.get(kb_id)
-                    entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
-                    sim = cosine(entity_encodings, mention_enc_t)
-                    score = prior_prob + sim - (prior_prob*sim)  # TODO: weights ?
+                    entity_encoding = c.entity_vector
+                    sim = cosine([entity_encoding], mention_enc_t)
+                    score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                     scores.append(score)
 
                 best_index = scores.index(max(scores))
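
The new candidate scoring in EntityLinker.predict combines two signals per candidate: the alias prior probability from the KB, and the cosine similarity between the mention encoding and the entity vector that is now stored directly in the KB (c.entity_vector), merged as prior_prob + sim - prior_prob*sim (a probabilistic OR, assuming both values lie in [0, 1]). Below is a minimal standalone sketch of that scoring step using plain numpy rather than spaCy's internals; the candidate layout and helper names are illustrative, not the library API.

import numpy as np

def cosine(u, v):
    # cosine similarity between two 1-D vectors
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))

def score_candidates(mention_vector, candidates):
    # candidates: list of (kb_id, prior_prob, entity_vector) tuples (illustrative layout)
    best_kb_id, best_score = None, -1.0
    for kb_id, prior_prob, entity_vector in candidates:
        sim = cosine(entity_vector, mention_vector)
        # probabilistic OR of prior and context similarity; how to weight the two factors
        # is left open in the patch as well
        score = prior_prob + sim - (prior_prob * sim)
        if score > best_score:
            best_kb_id, best_score = kb_id, score
    return best_kb_id, best_score

# toy usage with made-up vectors and priors
mention = [0.1, 0.9, 0.0]
cands = [("Q42", 0.7, [0.1, 0.8, 0.1]), ("Q185", 0.2, [0.9, 0.0, 0.1])]
print(score_candidates(mention, cands))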