From 0b04d142de01806e15a696fcc667c8563d438005 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 13 Jun 2019 22:32:56 +0200
Subject: [PATCH] regenerating KB

---
 .../wiki_entity_linking/kb_creator.py         | 19 +++++++++----------
 .../wiki_entity_linking/train_descriptions.py |  4 ++--
 .../wiki_entity_linking/wiki_nel_pipeline.py  |  8 ++++----
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index 785811ea6..7b740216b 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -14,6 +14,7 @@ from . import wikidata_processor as wd
 INPUT_DIM = 300  # dimension of pre-trained vectors
 DESC_WIDTH = 64
 
+
 def create_kb(nlp, max_entities_per_alias, min_occ,
               entity_def_output, entity_descr_output,
               count_input, prior_prob_input, to_print=False):
@@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
 
     if read_raw_data:
         print()
-        print("1. _read_wikidata_entities", datetime.datetime.now())
-        print()
+        print(" * _read_wikidata_entities", datetime.datetime.now())
         title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None)
 
         # write the title-ID and ID-description mappings to file
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
 
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:342]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:342]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
 
     entity_list = [title_to_id[x] for x in title_list]
@@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
         description_list = [id_to_descr.get(x, "No description defined")
                             for x in entity_list]
 
     print()
-    print("2. _get_entity_frequencies", datetime.datetime.now())
+    print(" * _get_entity_frequencies", datetime.datetime.now())
     print()
     entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)
 
     print()
-    print("3. train entity encoder", datetime.datetime.now())
+    print(" * train entity encoder", datetime.datetime.now())
     print()
     encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
     encoder.train(description_list=description_list, to_print=True)
 
     print()
-    print("4. get entity embeddings", datetime.datetime.now())
+    print(" * get entity embeddings", datetime.datetime.now())
     print()
     embeddings = encoder.apply_encoder(description_list)
 
     print()
-    print("5. adding", len(entity_list), "entities", datetime.datetime.now())
-    print()
+    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
     kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings)
 
     print()
-    print("6. adding aliases", datetime.datetime.now())
+    print(" * adding aliases", datetime.datetime.now())
     print()
     _add_aliases(kb, title_to_id=title_to_id,
                  max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py
index e1a2f1797..92859fd84 100644
--- a/examples/pipeline/wiki_entity_linking/train_descriptions.py
+++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py
@@ -17,7 +17,7 @@ class EntityEncoder:
 
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.1
+    STOP_THRESHOLD = 0.04
 
     BATCH_SIZE = 1000
 
@@ -127,7 +127,7 @@ class EntityEncoder:
         return loss, gradients
 
     def _test_encoder(self):
-        """ Test encoder on some dummy examples """
+        # Test encoder on some dummy examples
         desc_A1 = "Fictional character in The Simpsons"
         desc_A2 = "Simpsons - fictional human"
         desc_A3 = "Fictional character in The Flintstones"
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index 0c03784a1..d5002e26f 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -22,7 +22,7 @@
 ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
 ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'
-KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb'
 NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1'
 NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2'
 
@@ -56,14 +56,14 @@ def run_pipeline():
     create_wp_training = False
 
     # train the EL pipe
-    train_pipe = True
+    train_pipe = False
    measure_performance = False
 
     # test the EL pipe on a simple example
-    to_test_pipeline = True
+    to_test_pipeline = False
 
     # write the NLP object, read back in and test again
-    test_nlp_io = True
+    test_nlp_io = False
 
     # STEP 1 : create prior probabilities from WP
     # run only once !