mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
code cleanup
This commit is contained in:
parent
d8b435ceff
commit
61f0e2af65
|
@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
|
|||
title_list = title_list[0:34200]
|
||||
title_to_id = {t: title_to_id[t] for t in title_list}
|
||||
|
||||
# print("title_list", len(title_list), title_list[0:3])
|
||||
|
||||
entity_list = [title_to_id[x] for x in title_list]
|
||||
# print("entity_list", len(entity_list), entity_list[0:3])
|
||||
|
||||
# TODO: should we remove entities from the KB where there is no description ?
|
||||
# Currently keeping entities from the KB where there is no description - putting a default void description
|
||||
description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
|
||||
# print("description_list", len(description_list), description_list[0:3])
|
||||
|
||||
|
||||
print()
|
||||
print("2. _get_entity_frequencies", datetime.datetime.now())
|
||||
|
@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
|
|||
print("4. get entity embeddings", datetime.datetime.now())
|
||||
print()
|
||||
embeddings = encoder.apply_encoder(description_list)
|
||||
# print("descriptions", description_list[0:3])
|
||||
# print("embeddings", len(embeddings), embeddings[0:3])
|
||||
#print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3])
|
||||
|
||||
print()
|
||||
print("5. adding", len(entity_list), "entities", datetime.datetime.now())
|
||||
|
@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_
|
|||
for qid, descr in id_to_descr.items():
|
||||
descr_file.write(str(qid) + "|" + descr + "\n")
|
||||
|
||||
|
||||
def _get_entity_to_id(entity_def_output):
|
||||
entity_to_id = dict()
|
||||
with open(entity_def_output, 'r', encoding='utf8') as csvfile:
|
||||
|
|
|
@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator
|
|||
|
||||
|
||||
def run_kb_toy_example(kb):
|
||||
for mention in ("Bush", "President", "Homer"):
|
||||
for mention in ("Bush", "Douglas Adams", "Homer"):
|
||||
candidates = kb.get_candidates(mention)
|
||||
|
||||
print("generating candidates for " + mention + " :")
|
||||
|
@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True):
|
|||
return precision, recall, fscore, accuracy
|
||||
|
||||
|
||||
def _prepare_pipeline(nlp, kb):
|
||||
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
|
||||
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
|
||||
nlp.add_pipe(el_pipe, last=True)
|
||||
|
||||
|
||||
|
||||
# TODO
|
||||
def add_coref():
|
||||
def add_coref(nlp):
|
||||
""" Add coreference resolution to our model """
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
# nlp = spacy.load('en')
|
||||
|
||||
# TODO: this doesn't work yet
|
||||
# neuralcoref.add_to_pipe(nlp)
|
||||
print("done adding to pipe")
|
||||
|
|
|
@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv"
|
|||
def create_training(kb, entity_def_input, training_output):
|
||||
if not kb:
|
||||
raise ValueError("kb should be defined")
|
||||
# nlp = spacy.load('en_core_web_sm')
|
||||
wp_to_id = kb_creator._get_entity_to_id(entity_def_input)
|
||||
_process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset
|
||||
|
||||
|
|
|
@ -37,11 +37,13 @@ if __name__ == "__main__":
|
|||
|
||||
# read KB back in from file
|
||||
to_read_kb = True
|
||||
to_test_kb = True
|
||||
to_test_kb = False
|
||||
|
||||
# create training dataset
|
||||
create_wp_training = False
|
||||
|
||||
train_pipe = True
|
||||
|
||||
# run EL training
|
||||
run_el_training = False
|
||||
|
||||
|
@ -106,7 +108,15 @@ if __name__ == "__main__":
|
|||
print("STEP 5: create training dataset", datetime.datetime.now())
|
||||
training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)
|
||||
|
||||
# STEP 6: apply the EL algorithm on the training dataset
|
||||
# STEP 6: create the entity linking pipe
|
||||
if train_pipe:
|
||||
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
|
||||
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
|
||||
nlp.add_pipe(el_pipe, last=True)
|
||||
|
||||
### BELOW CODE IS DEPRECATED ###
|
||||
|
||||
# STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
|
||||
if run_el_training:
|
||||
print("STEP 6: training", datetime.datetime.now())
|
||||
trainer = EL_Model(kb=my_kb, nlp=nlp)
|
||||
|
|
|
@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser):
|
|||
|
||||
|
||||
class EntityLinker(Pipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
DOCS: TODO
|
||||
"""
|
||||
name = 'entity_linker'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, **cfg):
|
||||
embed_width = cfg.get("embed_width", 300)
|
||||
hidden_width = cfg.get("hidden_width", 32)
|
||||
entity_width = cfg.get("entity_width", 64)
|
||||
article_width = cfg.get("article_width", 128)
|
||||
sent_width = cfg.get("sent_width", 64)
|
||||
|
||||
entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
|
||||
entity_width = cfg["kb"].entity_vector_length
|
||||
|
||||
article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
|
||||
sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
|
||||
|
||||
# dimension of the mention encoder needs to match the dimension of the entity encoder
|
||||
mention_width = entity_encoder.nO
|
||||
mention_width = article_width + sent_width
|
||||
mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
|
||||
|
||||
return entity_encoder, article_encoder, sent_encoder, mention_encoder
|
||||
return article_encoder, sent_encoder, mention_encoder
|
||||
|
||||
def __init__(self, **cfg):
|
||||
# TODO: bring-your-own-model
|
||||
self.mention_encoder = True
|
||||
|
||||
self.cfg = dict(cfg)
|
||||
self.kb = self.cfg["kb"]
|
||||
|
||||
# TODO: fix this. store entity vectors in the KB ?
|
||||
self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
|
||||
|
||||
def use_avg_params(self):
|
||||
"""Modify the pipe's encoders/models, to use their average parameter values."""
|
||||
with self.article_encoder.use_params(self.sgd_article.averages) \
|
||||
and self.entity_encoder.use_params(self.sgd_entity.averages)\
|
||||
and self.sent_encoder.use_params(self.sgd_sent.averages) \
|
||||
and self.mention_encoder.use_params(self.sgd_mention.averages):
|
||||
yield
|
||||
|
@ -1113,14 +1109,13 @@ class EntityLinker(Pipe):
|
|||
|
||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||
if self.mention_encoder is True:
|
||||
self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
|
||||
self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
|
||||
self.sgd_article = create_default_optimizer(self.article_encoder.ops)
|
||||
self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
|
||||
self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
|
||||
self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
|
||||
""" docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
|
||||
""" docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """
|
||||
self.require_model()
|
||||
|
||||
entity_docs, article_docs, sentence_docs = docs
|
||||
|
@ -1131,7 +1126,7 @@ class EntityLinker(Pipe):
|
|||
article_docs = [article_docs]
|
||||
sentence_docs = [sentence_docs]
|
||||
|
||||
entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
|
||||
entity_encodings = None #TODO
|
||||
doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
|
||||
sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
|
||||
|
||||
|
@ -1195,10 +1190,9 @@ class EntityLinker(Pipe):
|
|||
for c in candidates:
|
||||
prior_prob = c.prior_prob
|
||||
kb_id = c.entity_
|
||||
description = self.id_to_descr.get(kb_id)
|
||||
entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ?
|
||||
sim = cosine(entity_encodings, mention_enc_t)
|
||||
score = prior_prob + sim - (prior_prob*sim) # TODO: weights ?
|
||||
entity_encoding = c.entity_vector
|
||||
sim = cosine([entity_encoding], mention_enc_t)
|
||||
score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ?
|
||||
scores.append(score)
|
||||
|
||||
best_index = scores.index(max(scores))
|
||||
|
|
Loading…
Reference in New Issue
Block a user