baseline evaluation using highest-freq candidate

This commit is contained in:
svlandeg 2019-05-06 15:13:50 +02:00
parent 6961215578
commit 7e348d7f7f
4 changed files with 152 additions and 29 deletions
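The baseline evaluated in this commit resolves each mention to the knowledge-base candidate with the highest entity frequency. A minimal sketch of that selection rule, using only the candidate attributes visible in the diffs below (entity_, entity_freq); the helper name and the empty-string fallback are illustrative and not part of this commit:

    def baseline_highest_freq(kb, mention):
        # Pick the candidate seen most often in the corpus and return its entity id,
        # or an empty string when the KB proposes no candidates for this mention.
        candidates = kb.get_candidates(mention)
        if not candidates:
            return ""
        return max(candidates, key=lambda c: c.entity_freq).entity_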

View File

@@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
    if to_print:
        print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())


def test_kb(kb):
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    nlp = spacy.load('en_core_web_sm')

    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)

    candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for c in candidates:
        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."

    doc = nlp(text)
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)

View File

@@ -1,12 +1,113 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import spacy
import datetime
from os import listdir

from examples.pipeline.wiki_entity_linking import training_set_creator

# requires: pip install neuralcoref --no-binary neuralcoref
# import neuralcoref


def run_el_toy_example(nlp, kb):
    _prepare_pipeline(nlp, kb)

    candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for c in candidates:
        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."

    doc = nlp(text)
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)


def run_el_training(nlp, kb, training_dir, limit=None):
    _prepare_pipeline(nlp, kb)

    correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir,
                                                                                 collect_correct=True,
                                                                                 collect_incorrect=False)

    predictions = list()
    golds = list()

    cnt = 0
    for f in listdir(training_dir):
        if not limit or cnt < limit:
            if is_dev(f):
                article_id = f.replace(".txt", "")
                if cnt % 500 == 0:
                    print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
                cnt += 1

                with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                    text = file.read()
                    doc = nlp(text)
                    for ent in doc.ents:
                        if ent.label_ == "PERSON":  # TODO: expand to other types
                            gold_entity = correct_entries_per_article[article_id].get(ent.text, None)
                            # only evaluating gold entities we know, because the training data is not complete
                            if gold_entity:
                                predictions.append(ent.kb_id_)
                                golds.append(gold_entity)

    print("Processed", cnt, "dev articles")
    print()

    evaluate(predictions, golds)


def is_dev(file_name):
    return file_name.endswith("3.txt")


def evaluate(predictions, golds):
    if len(predictions) != len(golds):
        raise ValueError("predictions and gold entities should have the same length")

    print("Evaluating", len(golds), "entities")

    tp = 0
    fp = 0
    fn = 0

    for pred, gold in zip(predictions, golds):
        is_correct = pred == gold
        if not pred:
            fn += 1
        elif is_correct:
            tp += 1
        else:
            fp += 1

    print("tp", tp)
    print("fp", fp)
    print("fn", fn)

    precision = tp / (tp + fp + 0.0000001)
    recall = tp / (tp + fn + 0.0000001)
    fscore = 2 * recall * precision / (recall + precision + 0.0000001)

    print("precision", round(100 * precision, 1), "%")
    print("recall", round(100 * recall, 1), "%")
    print("Fscore", round(100 * fscore, 1), "%")

def _prepare_pipeline(nlp, kb):
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)


# TODO
def add_coref():
    """ Add coreference resolution to our model """

View File

@@ -12,6 +12,7 @@ from . import wikipedia_processor as wp
Process Wikipedia interlinks to generate a training dataset for the EL algorithm
"""

ENTITY_FILE = "gold_entities.csv"


def create_training(kb, entity_input, training_output):
    if not kb:
@@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
    read_ids = set()
    entityfile_loc = training_output + "/" + "gold_entities.csv"
    entityfile_loc = training_output + "/" + ENTITY_FILE
    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
        # write entity training header file
        _write_training_entity(outputfile=entityfile,
@@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output):
def _write_training_entity(outputfile, article_id, alias, entity, correct):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")


def read_training_entities(training_output, collect_correct=True, collect_incorrect=False):
    entityfile_loc = training_output + "/" + ENTITY_FILE

    incorrect_entries_per_article = dict()
    correct_entries_per_article = dict()

    with open(entityfile_loc, mode='r', encoding='utf8') as file:
        for line in file:
            fields = line.replace('\n', "").split(sep='|')
            article_id = fields[0]
            alias = fields[1]
            entity = fields[2]
            correct = fields[3]

            if correct == "1" and collect_correct:
                entry_dict = correct_entries_per_article.get(article_id, dict())
                if alias in entry_dict:
                    raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE)
                entry_dict[alias] = entity
                correct_entries_per_article[article_id] = entry_dict

            if correct == "0" and collect_incorrect:
                entry_dict = incorrect_entries_per_article.get(article_id, dict())
                entities = entry_dict.get(alias, set())
                entities.add(entity)
                entry_dict[alias] = entities
                incorrect_entries_per_article[article_id] = entry_dict

    return correct_entries_per_article, incorrect_entries_per_article
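
# For reference, read_training_entities expects the pipe-delimited lines produced by
# _write_training_entity above: article_id|alias|entity|correct, e.g. (values illustrative,
# not taken from the actual dataset):
#   1111|Douglas Adams|Douglas Adams|1
#   1111|Douglas|Douglas Adams|1
#   1111|Douglas|Douglas (disambiguation)|0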

View File

@@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

from . import wikipedia_processor as wp, kb_creator, training_set_creator
from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el

import spacy
from spacy.vocab import Vocab
@@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'

TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'


if __name__ == "__main__":
@@ -37,8 +36,12 @@ if __name__ == "__main__":
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # apply named entity linking to the training dataset
    apply_to_training = True

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
@@ -88,13 +91,21 @@ if __name__ == "__main__":
        # test KB
        if to_test_kb:
            kb_creator.test_kb(my_kb)
            my_nlp = spacy.load('en_core_web_sm')
            run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)
        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)

    # STEP 6: apply the EL algorithm on the training dataset
    if apply_to_training:
        my_nlp = spacy.load('en_core_web_sm')
        run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
        print()

    # TODO coreference resolution
    # add_coref()