code cleanup

This commit is contained in:
svlandeg 2019-06-06 20:22:14 +02:00
parent d8b435ceff
commit 61f0e2af65
5 changed files with 31 additions and 41 deletions

View File

@@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
title_list = title_list[0:34200]
title_to_id = {t: title_to_id[t] for t in title_list}
# print("title_list", len(title_list), title_list[0:3])
entity_list = [title_to_id[x] for x in title_list]
# print("entity_list", len(entity_list), entity_list[0:3])
# TODO: should we remove entities from the KB where there is no description ?
# Currently keeping entities without a description in the KB, assigning a default placeholder description
description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
# print("description_list", len(description_list), description_list[0:3])
print()
print("2. _get_entity_frequencies", datetime.datetime.now())
@@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
print("4. get entity embeddings", datetime.datetime.now())
print()
embeddings = encoder.apply_encoder(description_list)
# print("descriptions", description_list[0:3])
# print("embeddings", len(embeddings), embeddings[0:3])
#print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3])
print()
print("5. adding", len(entity_list), "entities", datetime.datetime.now())
@@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_
for qid, descr in id_to_descr.items():
descr_file.write(str(qid) + "|" + descr + "\n")
def _get_entity_to_id(entity_def_output):
entity_to_id = dict()
with open(entity_def_output, 'r', encoding='utf8') as csvfile:
@@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
print("wp titles:", wp_titles)
# adding aliases with prior probabilities
# we can read this file sequentially, it's sorted by alias, and then by count
with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
# skip header
prior_file.readline()
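
Since the prior-probability file is sorted by alias and then by count, the aliases and their prior probabilities can be added to the KB in a single sequential pass. A minimal sketch of that pattern, assuming an alias|count|entity column layout and a kb.add_alias(alias, entities, probabilities) call; both are illustrative assumptions, not necessarily the exact format used here:

from itertools import groupby

def add_aliases_from_sorted_file(kb, prior_prob_input, max_entities_per_alias, min_occ):
    # one pass is enough because the file is sorted by alias, then by descending count
    # assumed line layout for illustration: alias|count|entity
    with open(prior_prob_input, mode="r", encoding="utf8") as prior_file:
        prior_file.readline()  # skip header
        rows = (line.rstrip("\n").split("|") for line in prior_file)
        for alias, group in groupby(rows, key=lambda row: row[0]):
            pairs = [(int(count), entity) for _, count, entity in group]
            total = sum(count for count, _ in pairs)
            # keep the most frequent entities and turn their counts into prior probabilities
            selected = [(c, e) for c, e in pairs if c >= min_occ][:max_entities_per_alias]
            if selected and total:
                kb.add_alias(alias=alias,
                             entities=[e for _, e in selected],
                             probabilities=[c / total for c, _ in selected])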

View File

@@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator
def run_kb_toy_example(kb):
for mention in ("Bush", "President", "Homer"):
for mention in ("Bush", "Douglas Adams", "Homer"):
candidates = kb.get_candidates(mention)
print("generating candidates for " + mention + " :")
@@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True):
return precision, recall, fscore, accuracy
def _prepare_pipeline(nlp, kb):
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
nlp.add_pipe(el_pipe, last=True)
# TODO
def add_coref():
def add_coref(nlp):
""" Add coreference resolution to our model """
nlp = spacy.load('en_core_web_sm')
# nlp = spacy.load('en')
# TODO: this doesn't work yet
# neuralcoref.add_to_pipe(nlp)
print("done adding to pipe")

View File

@@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv"
def create_training(kb, entity_def_input, training_output):
if not kb:
raise ValueError("kb should be defined")
# nlp = spacy.load('en_core_web_sm')
wp_to_id = kb_creator._get_entity_to_id(entity_def_input)
_process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset

View File

@@ -37,11 +37,13 @@ if __name__ == "__main__":
# read KB back in from file
to_read_kb = True
to_test_kb = True
to_test_kb = False
# create training dataset
create_wp_training = False
train_pipe = True
# run EL training
run_el_training = False
@@ -106,7 +108,15 @@ if __name__ == "__main__":
print("STEP 5: create training dataset", datetime.datetime.now())
training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)
# STEP 6: apply the EL algorithm on the training dataset
# STEP 6: create the entity linking pipe
if train_pipe:
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
nlp.add_pipe(el_pipe, last=True)
### BELOW CODE IS DEPRECATED ###
# STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
if run_el_training:
print("STEP 6: training", datetime.datetime.now())
trainer = EL_Model(kb=my_kb, nlp=nlp)

View File

@@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser):
class EntityLinker(Pipe):
"""Pipeline component for named entity linking.
DOCS: TODO
"""
name = 'entity_linker'
@classmethod
def Model(cls, **cfg):
embed_width = cfg.get("embed_width", 300)
hidden_width = cfg.get("hidden_width", 32)
entity_width = cfg.get("entity_width", 64)
article_width = cfg.get("article_width", 128)
sent_width = cfg.get("sent_width", 64)
entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
entity_width = cfg["kb"].entity_vector_length
article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
# dimension of the mention encoder needs to match the dimension of the entity encoder
mention_width = entity_encoder.nO
mention_width = article_width + sent_width
mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
return entity_encoder, article_encoder, sent_encoder, mention_encoder
return article_encoder, sent_encoder, mention_encoder
def __init__(self, **cfg):
# TODO: bring-your-own-model
self.mention_encoder = True
self.cfg = dict(cfg)
self.kb = self.cfg["kb"]
# TODO: fix this. store entity vectors in the KB ?
self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
def use_avg_params(self):
"""Modify the pipe's encoders/models, to use their average parameter values."""
with self.article_encoder.use_params(self.sgd_article.averages) \
and self.entity_encoder.use_params(self.sgd_entity.averages)\
and self.sent_encoder.use_params(self.sgd_sent.averages) \
and self.mention_encoder.use_params(self.sgd_mention.averages):
yield
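
A note on the dimension bookkeeping in the Model classmethod above: the article and sentence encodings are concatenated into a mention representation of width article_width + sent_width, and the Affine layer projects that down to the KB's entity vector length so mentions and entity vectors can be compared directly. A minimal numpy sketch of the shape flow, using the default widths from the config above and 64 as a stand-in for kb.entity_vector_length; it illustrates the shapes only, not the Thinc implementation:

import numpy as np

article_width, sent_width = 128, 64            # defaults from the config above
entity_width = 64                              # stand-in for kb.entity_vector_length
mention_width = article_width + sent_width

article_enc = np.random.rand(article_width)    # stand-in for article_encoder(article_doc)
sent_enc = np.random.rand(sent_width)          # stand-in for sent_encoder(sentence_doc)

# concatenate the two views of the mention, then project to the entity vector length
W = np.random.rand(entity_width, mention_width)
b = np.random.rand(entity_width)
mention_enc = W @ np.concatenate([article_enc, sent_enc]) + b

assert mention_enc.shape == (entity_width,)    # now comparable to KB entity vectors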
@@ -1113,14 +1109,13 @@ class EntityLinker(Pipe):
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
if self.mention_encoder is True:
self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
self.sgd_article = create_default_optimizer(self.article_encoder.ops)
self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
""" docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
""" docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """
self.require_model()
entity_docs, article_docs, sentence_docs = docs
@@ -1131,7 +1126,7 @@ class EntityLinker(Pipe):
article_docs = [article_docs]
sentence_docs = [sentence_docs]
entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
entity_encodings = None #TODO
doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
@@ -1195,10 +1190,9 @@ class EntityLinker(Pipe):
for c in candidates:
prior_prob = c.prior_prob
kb_id = c.entity_
description = self.id_to_descr.get(kb_id)
entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ?
sim = cosine(entity_encodings, mention_enc_t)
score = prior_prob + sim - (prior_prob*sim) # TODO: weights ?
entity_encoding = c.entity_vector
sim = cosine([entity_encoding], mention_enc_t)
score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ?
scores.append(score)
best_index = scores.index(max(scores))
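
The candidate score above combines the alias prior probability with the encoder similarity as prior_prob + sim - prior_prob * sim, i.e. a probabilistic OR of the two signals: the score is high when either signal is high, and it stays in [0, 1] as long as both inputs do (a raw cosine can be negative, so in practice the similarity may need clipping or rescaling first). A small worked example with illustrative numbers only:

def combine(prior_prob, sim):
    # probabilistic OR of prior probability and (clipped) similarity
    return prior_prob + sim - (prior_prob * sim)

print(combine(0.7, 0.2))   # 0.76 -> a strong prior keeps the score high
print(combine(0.1, 0.9))   # 0.91 -> a strong context match can outweigh a weak prior
print(combine(0.5, 0.5))   # 0.75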