Mirror of https://github.com/explosion/spaCy.git

Commit 0b04d142de: regenerating KB
Parent: 78dd3e11da
@@ -14,6 +14,7 @@ from . import wikidata_processor as wd
 INPUT_DIM = 300  # dimension of pre-trained vectors
 DESC_WIDTH = 64
 
+
 def create_kb(nlp, max_entities_per_alias, min_occ,
               entity_def_output, entity_descr_output,
               count_input, prior_prob_input, to_print=False):
@@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
 
     if read_raw_data:
         print()
-        print("1. _read_wikidata_entities", datetime.datetime.now())
-        print()
+        print(" * _read_wikidata_entities", datetime.datetime.now())
         title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None)
 
         # write the title-ID and ID-description mappings to file
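For context, wd.read_wikidata_entities_json streams the raw Wikidata dump to build the two mappings written to file below. A minimal sketch of such a reader, assuming the standard Wikidata JSON dump layout (one entity per line inside a single top-level array); the path and details are illustrative, not the repo's actual implementation:

import bz2
import json

def read_wikidata_entities_json(dump_path="wikidata-latest-all.json.bz2", limit=None):
    # The dump is one huge JSON array, but each entity sits on its own
    # line, so it can be streamed without loading the whole file.
    title_to_id = {}
    id_to_descr = {}
    with bz2.open(dump_path, mode="rt", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            line = line.strip().rstrip(",")
            if line in ("[", "]", ""):
                continue  # skip the array brackets
            entity = json.loads(line)
            qid = entity.get("id")
            # map the English Wikipedia title to the Wikidata QID, if present
            site = entity.get("sitelinks", {}).get("enwiki")
            if qid is not None and site is not None:
                title_to_id[site["title"]] = qid
            # keep the short English description, if present
            descr = entity.get("descriptions", {}).get("en")
            if qid is not None and descr is not None:
                id_to_descr[qid] = descr["value"]
    return title_to_id, id_to_descr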
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
 
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:342]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:342]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
 
     entity_list = [title_to_id[x] for x in title_list]
 
@@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
 
     print()
-    print("2. _get_entity_frequencies", datetime.datetime.now())
+    print(" * _get_entity_frequencies", datetime.datetime.now())
     print()
     entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)
 
     print()
-    print("3. train entity encoder", datetime.datetime.now())
+    print(" * train entity encoder", datetime.datetime.now())
     print()
 
     encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
     encoder.train(description_list=description_list, to_print=True)
     print()
 
-    print("4. get entity embeddings", datetime.datetime.now())
+    print(" * get entity embeddings", datetime.datetime.now())
     print()
     embeddings = encoder.apply_encoder(description_list)
 
     print()
-    print("5. adding", len(entity_list), "entities", datetime.datetime.now())
-    print()
+    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
     kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings)
 
     print()
-    print("6. adding aliases", datetime.datetime.now())
+    print(" * adding aliases", datetime.datetime.now())
     print()
     _add_aliases(kb, title_to_id=title_to_id,
                  max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
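The kb.set_entities and _add_aliases calls above fill spaCy's KnowledgeBase as its API stood at this point in development (the diff itself passes prob_list=). A minimal usage sketch with dummy values standing in for the real entity lists and 64-d encoded vectors; the entity_vector_length argument follows the later 2.2-era constructor and may differ here:

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load("en_core_web_lg")  # supplies the vocab and 300-d input vectors

kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)  # 64 = DESC_WIDTH
# register entities with their prior frequencies and encoded description vectors
kb.set_entities(entity_list=["Q42"], prob_list=[0.3], vector_list=[[0.0] * 64])
# an alias maps a surface form to candidate entities with prior probabilities
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])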
@@ -17,7 +17,7 @@ class EntityEncoder:
 
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.1
+    STOP_THRESHOLD = 0.04
 
     BATCH_SIZE = 1000
 
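Lowering STOP_THRESHOLD tightens the encoder's early-stopping criterion: training now runs until the loss drops below 0.04 instead of 0.1, rather than always finishing all epochs. A schematic of how such a loss cutoff typically gates the epoch loop, not the class's actual code; update_fn stands in for the encoder's hypothetical per-batch update:

from spacy.util import minibatch

def train_until_threshold(update_fn, descriptions,
                          epochs=5, batch_size=1000, stop_threshold=0.04):
    # Run at most `epochs` passes, but stop as soon as the most recent
    # batch loss falls below `stop_threshold` (the role STOP_THRESHOLD plays).
    loss = float("inf")
    for epoch in range(epochs):
        if loss < stop_threshold:
            break  # converged early
        for batch in minibatch(descriptions, size=batch_size):
            loss = update_fn(batch)  # hypothetical: returns the batch loss
        print("epoch", epoch, "loss", loss)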
@@ -127,7 +127,7 @@ class EntityEncoder:
         return loss, gradients
 
     def _test_encoder(self):
-        """ Test encoder on some dummy examples """
+        # Test encoder on some dummy examples
         desc_A1 = "Fictional character in The Simpsons"
         desc_A2 = "Simpsons - fictional human"
         desc_A3 = "Fictional character in The Flintstones"
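The dummy descriptions give a sanity check: after encoding, the two Simpsons descriptions should land closer together than the Flintstones one. A sketch of that comparison, assuming an encoder object with the apply_encoder method seen above; the helper is illustrative:

import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def check_encoder(encoder):
    # similar descriptions should encode to nearby vectors
    descs = ["Fictional character in The Simpsons",
             "Simpsons - fictional human",
             "Fictional character in The Flintstones"]
    v1, v2, v3 = encoder.apply_encoder(descs)
    assert cosine(v1, v2) > cosine(v1, v3)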
@@ -22,7 +22,7 @@ ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
 ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'
 
-KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb'
 NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1'
 NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2'
 
@@ -56,14 +56,14 @@ def run_pipeline():
     create_wp_training = False
 
     # train the EL pipe
-    train_pipe = True
+    train_pipe = False
     measure_performance = False
 
     # test the EL pipe on a simple example
-    to_test_pipeline = True
+    to_test_pipeline = False
 
     # write the NLP object, read back in and test again
-    test_nlp_io = True
+    test_nlp_io = False
 
     # STEP 1 : create prior probabilities from WP
     # run only once !
 
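All three toggles are switched off here, leaving only the earlier KB-regeneration steps active. Schematically, run_pipeline gates each stage behind one of these booleans; the control flow below is reconstructed from the flag names and comments, with the stage bodies elided:

def run_pipeline():
    # expensive stages are toggled independently between runs
    train_pipe = False        # train the EL pipe
    measure_performance = False
    to_test_pipeline = False  # test the EL pipe on a simple example
    test_nlp_io = False       # write the NLP object, read back in, test again

    if train_pipe:
        ...
    if measure_performance:
        ...
    if to_test_pipeline:
        ...
    if test_nlp_io:
        ...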