mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
regenerating KB
This commit is contained in:
parent
78dd3e11da
commit
0b04d142de
|
@ -14,6 +14,7 @@ from . import wikidata_processor as wd
|
|||
INPUT_DIM = 300 # dimension of pre-trained vectors
|
||||
DESC_WIDTH = 64
|
||||
|
||||
|
||||
def create_kb(nlp, max_entities_per_alias, min_occ,
|
||||
entity_def_output, entity_descr_output,
|
||||
count_input, prior_prob_input, to_print=False):
|
||||
|
@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
|
|||
|
||||
if read_raw_data:
|
||||
print()
|
||||
print("1. _read_wikidata_entities", datetime.datetime.now())
|
||||
print()
|
||||
print(" * _read_wikidata_entities", datetime.datetime.now())
|
||||
title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None)
|
||||
|
||||
# write the title-ID and ID-description mappings to file
|
||||
|
@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
|
|||
title_list = list(title_to_id.keys())
|
||||
|
||||
# TODO: remove this filter (just for quicker testing of code)
|
||||
title_list = title_list[0:342]
|
||||
title_to_id = {t: title_to_id[t] for t in title_list}
|
||||
# title_list = title_list[0:342]
|
||||
# title_to_id = {t: title_to_id[t] for t in title_list}
|
||||
|
||||
entity_list = [title_to_id[x] for x in title_list]
|
||||
|
||||
|
@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
|
|||
description_list = [id_to_descr.get(x, "No description defined") for x in entity_list]
|
||||
|
||||
print()
|
||||
print("2. _get_entity_frequencies", datetime.datetime.now())
|
||||
print(" * _get_entity_frequencies", datetime.datetime.now())
|
||||
print()
|
||||
entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)
|
||||
|
||||
print()
|
||||
print("3. train entity encoder", datetime.datetime.now())
|
||||
print(" * train entity encoder", datetime.datetime.now())
|
||||
print()
|
||||
|
||||
encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
|
||||
encoder.train(description_list=description_list, to_print=True)
|
||||
print()
|
||||
|
||||
print("4. get entity embeddings", datetime.datetime.now())
|
||||
print(" * get entity embeddings", datetime.datetime.now())
|
||||
print()
|
||||
embeddings = encoder.apply_encoder(description_list)
|
||||
|
||||
print()
|
||||
print("5. adding", len(entity_list), "entities", datetime.datetime.now())
|
||||
print()
|
||||
print(" * adding", len(entity_list), "entities", datetime.datetime.now())
|
||||
kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings)
|
||||
|
||||
print()
|
||||
print("6. adding aliases", datetime.datetime.now())
|
||||
print(" * adding aliases", datetime.datetime.now())
|
||||
print()
|
||||
_add_aliases(kb, title_to_id=title_to_id,
|
||||
max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
|
||||
|
|
|
@ -17,7 +17,7 @@ class EntityEncoder:
|
|||
|
||||
DROP = 0
|
||||
EPOCHS = 5
|
||||
STOP_THRESHOLD = 0.1
|
||||
STOP_THRESHOLD = 0.04
|
||||
|
||||
BATCH_SIZE = 1000
|
||||
|
||||
|
@ -127,7 +127,7 @@ class EntityEncoder:
|
|||
return loss, gradients
|
||||
|
||||
def _test_encoder(self):
|
||||
""" Test encoder on some dummy examples """
|
||||
# Test encoder on some dummy examples
|
||||
desc_A1 = "Fictional character in The Simpsons"
|
||||
desc_A2 = "Simpsons - fictional human"
|
||||
desc_A3 = "Fictional character in The Flintstones"
|
||||
|
|
|
@ -22,7 +22,7 @@ ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
|
|||
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
|
||||
ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'
|
||||
|
||||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb'
|
||||
NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1'
|
||||
NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2'
|
||||
|
||||
|
@ -56,14 +56,14 @@ def run_pipeline():
|
|||
create_wp_training = False
|
||||
|
||||
# train the EL pipe
|
||||
train_pipe = True
|
||||
train_pipe = False
|
||||
measure_performance = False
|
||||
|
||||
# test the EL pipe on a simple example
|
||||
to_test_pipeline = True
|
||||
to_test_pipeline = False
|
||||
|
||||
# write the NLP object, read back in and test again
|
||||
test_nlp_io = True
|
||||
test_nlp_io = False
|
||||
|
||||
# STEP 1 : create prior probabilities from WP
|
||||
# run only once !
|
||||
|
|
Loading…
Reference in New Issue
Block a user