regenerating KB

svlandeg 2019-06-13 22:32:56 +02:00
parent 78dd3e11da
commit 0b04d142de
3 changed files with 15 additions and 16 deletions
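The three files below drive the KB regeneration: a KB-creation script, the EntityEncoder that pretrains description vectors, and the pipeline script whose step toggles are flipped so that only the KB is rebuilt. As a minimal sketch (not part of the commit), this is the population flow the first file performs, assuming the spaCy KnowledgeBase API that matches the set_entities call visible in the diff; the entity ID, alias, and numbers are illustrative only:

from spacy.lang.en import English
from spacy.kb import KnowledgeBase

# Build a KB whose entity vectors match the encoder's output width (DESC_WIDTH = 64).
nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)

# Each entity carries a prior frequency and a pretrained description vector.
kb.set_entities(entity_list=["Q42"], prob_list=[0.3], vector_list=[[0.0] * 64])

# Aliases map a surface form to candidate entities with prior probabilities.
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

print(kb.get_size_entities(), "entities,", kb.get_size_aliases(), "aliases")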


@@ -14,6 +14,7 @@ from . import wikidata_processor as wd
INPUT_DIM = 300 # dimension of pre-trained vectors
DESC_WIDTH = 64
def create_kb(nlp, max_entities_per_alias, min_occ,
entity_def_output, entity_descr_output,
count_input, prior_prob_input, to_print=False):
@@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
if read_raw_data:
print()
- print("1. _read_wikidata_entities", datetime.datetime.now())
- print()
+ print(" * _read_wikidata_entities", datetime.datetime.now())
title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None)
# write the title-ID and ID-description mappings to file
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
title_list = list(title_to_id.keys())
# TODO: remove this filter (just for quicker testing of code)
- title_list = title_list[0:342]
- title_to_id = {t: title_to_id[t] for t in title_list}
+ # title_list = title_list[0:342]
+ # title_to_id = {t: title_to_id[t] for t in title_list}
entity_list = [title_to_id[x] for x in title_list]
@@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
print()
- print("2. _get_entity_frequencies", datetime.datetime.now())
+ print(" * _get_entity_frequencies", datetime.datetime.now())
print()
entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)
print()
- print("3. train entity encoder", datetime.datetime.now())
+ print(" * train entity encoder", datetime.datetime.now())
print()
encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
encoder.train(description_list=description_list, to_print=True)
print()
- print("4. get entity embeddings", datetime.datetime.now())
+ print(" * get entity embeddings", datetime.datetime.now())
print()
embeddings = encoder.apply_encoder(description_list)
print()
- print("5. adding", len(entity_list), "entities", datetime.datetime.now())
- print()
+ print(" * adding", len(entity_list), "entities", datetime.datetime.now())
kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings)
print()
- print("6. adding aliases", datetime.datetime.now())
+ print(" * adding aliases", datetime.datetime.now())
print()
_add_aliases(kb, title_to_id=title_to_id,
max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,


@@ -17,7 +17,7 @@ class EntityEncoder:
DROP = 0
EPOCHS = 5
- STOP_THRESHOLD = 0.1
+ STOP_THRESHOLD = 0.04
BATCH_SIZE = 1000
@@ -127,7 +127,7 @@ class EntityEncoder:
return loss, gradients
def _test_encoder(self):
- """ Test encoder on some dummy examples """
+ # Test encoder on some dummy examples
desc_A1 = "Fictional character in The Simpsons"
desc_A2 = "Simpsons - fictional human"
desc_A3 = "Fictional character in The Flintstones"
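Lowering STOP_THRESHOLD from 0.1 to 0.04 makes the encoder keep training longer before its loss-based early stop triggers. A minimal sketch of that kind of gate, assuming a plain batch loop (EntityEncoder.train() itself is not part of this diff; update_batch is a stand-in for the real gradient step):

def train_encoder(descriptions, update_batch, epochs=5, batch_size=1000, stop_threshold=0.04):
    # update_batch(batch) -> float loss; stands in for the real update step.
    for epoch in range(epochs):
        losses = []
        for start in range(0, len(descriptions), batch_size):
            batch = descriptions[start:start + batch_size]
            losses.append(update_batch(batch))
        mean_loss = sum(losses) / max(len(losses), 1)
        # Stop early once the average loss drops below the threshold
        # (tightened from 0.1 to 0.04 in this commit).
        if mean_loss < stop_threshold:
            break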


@@ -22,7 +22,7 @@ ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'
- KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+ KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb'
NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1'
NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2'
@@ -56,14 +56,14 @@ def run_pipeline():
create_wp_training = False
# train the EL pipe
- train_pipe = True
+ train_pipe = False
measure_performance = False
# test the EL pipe on a simple example
- to_test_pipeline = True
+ to_test_pipeline = False
# write the NLP object, read back in and test again
- test_nlp_io = True
+ test_nlp_io = False
# STEP 1 : create prior probabilities from WP
# run only once !
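With train_pipe, to_test_pipeline and test_nlp_io switched off, this run only regenerates the KB under the new kb_1 directory. A hedged sketch of reading that KB back in for the later, currently disabled steps, assuming the spaCy v2-era KnowledgeBase serialization API and reusing the path constants above:

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load(NLP_1_DIR)  # assumption: the nlp directory written by this pipeline
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)  # must match DESC_WIDTH
kb.load_bulk(KB_FILE)  # e.g. the kb file under the new kb_1 directory
print("entities:", kb.get_size_entities(), "aliases:", kb.get_size_aliases())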