2019-05-06 11:56:56 +03:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2019-06-07 14:54:45 +03:00
|
|
|
import random
|
|
|
|
|
|
|
|
from spacy.util import minibatch, compounding
|
|
|
|
|
2019-05-09 18:23:19 +03:00
|
|
|
from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
|
|
|
|
from examples.pipeline.wiki_entity_linking.train_el import EL_Model
|
2019-05-06 11:56:56 +03:00
|
|
|
|
|
|
|
import spacy
|
|
|
|
from spacy.vocab import Vocab
|
|
|
|
from spacy.kb import KnowledgeBase
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
"""
|
|
|
|
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
|
|
|
|
"""
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# File locations. All intermediate artefacts live under one data directory;
# change _DATA_DIR to relocate them.
# ---------------------------------------------------------------------------
_DATA_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/'

# Written in STEP 1 (prior_prob_output), read in STEP 2 and STEP 3.
PRIOR_PROB = _DATA_DIR + 'prior_prob.csv'
# Written in STEP 2 (count_output), read in STEP 3 (count_input).
ENTITY_COUNTS = _DATA_DIR + 'entity_freq.csv'
# Written in STEP 3 (entity_def_output), read in STEP 5 (entity_def_input).
ENTITY_DEFS = _DATA_DIR + 'entity_defs.csv'
# Written in STEP 3 (entity_descr_output), read back in STEP 6.
ENTITY_DESCR = _DATA_DIR + 'entity_descriptions.csv'

# Knowledge base dump (STEP 3b) and the vocab directory stored alongside it;
# both are read back in STEP 4.
KB_FILE = _DATA_DIR + 'kb'
VOCAB_DIR = _DATA_DIR + 'vocab'

# Output of STEP 5 and input of STEP 6 / STEP 7 (note the trailing slash).
TRAINING_DIR = _DATA_DIR + 'training_data_nel/'

# ---------------------------------------------------------------------------
# KB creation and training settings.
# ---------------------------------------------------------------------------
# Passed to kb_creator.create_kb as max_entities_per_alias.
MAX_CANDIDATES = 10
# Passed to kb_creator.create_kb as min_occ.
MIN_PAIR_OCC = 5
# Passed to the entity_linker pipe as its "doc_cutoff" config value.
DOC_CHAR_CUTOFF = 300
# Number of passes over the training data in STEP 6.
EPOCHS = 5
# Passed to nlp.update as drop.
DROPOUT = 0.1
|
2019-05-06 11:56:56 +03:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Driver script: every step below is switched on or off by one of the
    # boolean flags defined right after the model is loaded. Steps that
    # produce files (1-3, 5) are meant to be run once; later steps read
    # those files back in.
    print("START", datetime.datetime.now())
    print()
    nlp = spacy.load('en_core_web_lg')
    # Stays None unless STEP 3 (create) or STEP 4 (read) is enabled; the
    # later steps pass it on as-is.
    my_kb = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the entity linking pipe
    train_pipe = True

    # run EL training
    # NOTE(review): this flag is not referenced by any step in this script.
    run_el_training = False

    # apply named entity linking to the dev dataset
    apply_to_dev = False

    # run the trained pipeline on a toy example
    to_test_pipeline = False

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        my_kb = kb_creator.create_kb(
            nlp,
            max_entities_per_alias=MAX_CANDIDATES,
            min_occ=MIN_PAIR_OCC,
            entity_def_output=ENTITY_DEFS,
            entity_descr_output=ENTITY_DESCR,
            count_input=ENTITY_COUNTS,
            prior_prob_input=PRIOR_PROB,
            to_print=False)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # The vocab is stored next to the KB so STEP 4 can rebuild the KB
        # on top of the same vocab.
        print("STEP 3b: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        nlp.vocab.to_disk(VOCAB_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64)  # TODO entity vectors
        my_kb.load_bulk(KB_FILE)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            run_el.run_kb_toy_example(kb=my_kb)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)

    # STEP 6: create the entity linking pipe
    if train_pipe:
        # NOTE(review): id_to_descr is not used further down in this script.
        id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR)

        train_limit = 10
        print("Training on", train_limit, "articles")

        train_data = training_set_creator.read_training(
            nlp=nlp,
            training_dir=TRAINING_DIR,
            dev=False,
            limit=train_limit,
            to_print=False)

        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})
        nlp.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
        with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
            nlp.begin_training()

            for itn in range(EPOCHS):
                print()
                print("EPOCH", itn)
                random.shuffle(train_data)
                # Fresh dict per epoch, handed to every nlp.update call of
                # that epoch and printed once the epoch's batches are done.
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                for batch in batches:
                    # Each batch item unpacks into a (doc, gold) pair.
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        drop=DROPOUT,
                        losses=losses,
                    )
                print("Losses", losses)

    # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?)
    if apply_to_dev:
        run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000)
        print()

    # test the EL pipeline on a toy example
    if to_test_pipeline:
        run_el.run_el_toy_example(kb=my_kb, nlp=nlp)
        print()

    # TODO coreference resolution
    # add_coref()

    print()
    print("STOP", datetime.datetime.now())
|