regenerating KB

This commit is contained in:
svlandeg 2019-06-13 22:32:56 +02:00
parent 78dd3e11da
commit 0b04d142de
3 changed files with 15 additions and 16 deletions

View File

@@ -14,6 +14,7 @@ from . import wikidata_processor as wd
 INPUT_DIM = 300  # dimension of pre-trained vectors
 DESC_WIDTH = 64
 
+
 def create_kb(nlp, max_entities_per_alias, min_occ,
               entity_def_output, entity_descr_output,
               count_input, prior_prob_input, to_print=False):
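For context, a hedged sketch of how create_kb might be invoked from the pipeline script later in this commit, wiring in the path constants defined there. The call site is not part of this diff: the argument values are illustrative, PRIOR_PROB is an assumed constant, and it is assumed that create_kb returns the populated KnowledgeBase.

kb = create_kb(
    nlp,
    max_entities_per_alias=10,        # illustrative value
    min_occ=5,                        # illustrative value
    entity_def_output=ENTITY_DEFS,
    entity_descr_output=ENTITY_DESCR,
    count_input=ENTITY_COUNTS,
    prior_prob_input=PRIOR_PROB,      # assumed: defined alongside the other paths
    to_print=True,
)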
@@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     if read_raw_data:
         print()
-        print("1. _read_wikidata_entities", datetime.datetime.now())
-        print()
+        print(" * _read_wikidata_entities", datetime.datetime.now())
         title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None)
 
         # write the title-ID and ID-description mappings to file
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
 
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:342]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:342]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
 
     entity_list = [title_to_id[x] for x in title_list]
@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
print() print()
print("2. _get_entity_frequencies", datetime.datetime.now()) print(" * _get_entity_frequencies", datetime.datetime.now())
print() print()
entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)
print() print()
print("3. train entity encoder", datetime.datetime.now()) print(" * train entity encoder", datetime.datetime.now())
print() print()
encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
encoder.train(description_list=description_list, to_print=True) encoder.train(description_list=description_list, to_print=True)
print() print()
print("4. get entity embeddings", datetime.datetime.now()) print(" * get entity embeddings", datetime.datetime.now())
print() print()
embeddings = encoder.apply_encoder(description_list) embeddings = encoder.apply_encoder(description_list)
print() print()
print("5. adding", len(entity_list), "entities", datetime.datetime.now()) print(" * adding", len(entity_list), "entities", datetime.datetime.now())
print()
kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings)
print() print()
print("6. adding aliases", datetime.datetime.now()) print(" * adding aliases", datetime.datetime.now())
print() print()
_add_aliases(kb, title_to_id=title_to_id, _add_aliases(kb, title_to_id=title_to_id,
max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
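The hunk above is the KB-population flow this commit tidies up. A minimal sketch of the underlying spaCy v2 KnowledgeBase calls, assuming that API; the entity IDs, counts and vectors below are toy values, not data from the script.

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load("en_core_web_lg")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)  # matches DESC_WIDTH

# set_entities takes parallel lists: entity IDs, prior frequencies, and the
# 64-dimensional description embeddings produced by the entity encoder.
kb.set_entities(entity_list=["Q42", "Q3107329"],
                prob_list=[342.0, 17.0],
                vector_list=[[0.0] * 64, [0.0] * 64])

# _add_aliases presumably boils down to calls like this: a surface form
# mapped to candidate entities with prior probabilities.
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])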

View File

@@ -17,7 +17,7 @@ class EntityEncoder:
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.1
+    STOP_THRESHOLD = 0.04
     BATCH_SIZE = 1000
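Lowering STOP_THRESHOLD from 0.1 to 0.04 lets the encoder train longer before stopping. A hedged sketch of that pattern, assuming the threshold is compared against the training loss; run_epoch is a stand-in, not the real EntityEncoder internals.

import random

EPOCHS = 5
STOP_THRESHOLD = 0.04  # was 0.1; training now continues until the loss is much lower

def run_epoch():
    # Stand-in for one pass over the description batches, returning a loss.
    return random.random() * 0.2

for epoch in range(EPOCHS):
    loss = run_epoch()
    print("epoch", epoch, "loss", loss)
    if loss < STOP_THRESHOLD:
        break  # stop early once the loss drops below the threshold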
@@ -127,7 +127,7 @@ class EntityEncoder:
         return loss, gradients
 
     def _test_encoder(self):
-        """ Test encoder on some dummy examples """
+        # Test encoder on some dummy examples
         desc_A1 = "Fictional character in The Simpsons"
         desc_A2 = "Simpsons - fictional human"
         desc_A3 = "Fictional character in The Flintstones"
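_test_encoder presumably encodes these dummy descriptions and checks that related ones land close together. A sketch of that kind of similarity check, using raw pre-trained document vectors as a stand-in for the trained encoder:

import spacy
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

nlp = spacy.load("en_core_web_lg")  # stand-in: pre-trained vectors, not the trained encoder
vec_a1 = nlp("Fictional character in The Simpsons").vector
vec_a2 = nlp("Simpsons - fictional human").vector
vec_b = nlp("Fictional character in The Flintstones").vector

print(cosine(vec_a1, vec_a2))  # the two Simpsons descriptions should score higher
print(cosine(vec_a1, vec_b))   # than a Simpsons vs. Flintstones pair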

View File

@@ -22,7 +22,7 @@ ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
 ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'
-KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb'
 NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1'
 NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2'
@@ -56,14 +56,14 @@ def run_pipeline():
     create_wp_training = False
 
     # train the EL pipe
-    train_pipe = True
+    train_pipe = False
     measure_performance = False
 
     # test the EL pipe on a simple example
-    to_test_pipeline = True
+    to_test_pipeline = False
 
     # write the NLP object, read back in and test again
-    test_nlp_io = True
+    test_nlp_io = False
 
     # STEP 1 : create prior probabilities from WP
     # run only once !
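run_pipeline gates each stage behind a module-level boolean, and this commit flips the training and testing stages off so a run only regenerates the KB. A minimal sketch of that flag pattern, with placeholder stage bodies rather than the script's real helpers:

train_pipe = False        # this commit: skip EL training while the KB is rebuilt
to_test_pipeline = False
test_nlp_io = False

def run_pipeline():
    if train_pipe:
        print("training the EL pipe")            # placeholder for the real step
    if to_test_pipeline:
        print("testing the EL pipe")
    if test_nlp_io:
        print("writing and re-reading the NLP object")

run_pipeline()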