diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index 7ca7cfad1..b9e663bb9 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
 
     if to_print:
         print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
-
-def test_kb(kb):
-    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
-    nlp = spacy.load('en_core_web_sm')
-
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
-
-    candidates = kb.get_candidates("Bush")
-
-    print("generating candidates for 'Bush' :")
-    for c in candidates:
-        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
-    print()
-
-    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
-           "Douglas reminds us to always bring our towel. " \
-           "The main character in Doug's novel is the man Arthur Dent, " \
-           "but Douglas doesn't write about George Washington or Homer Simpson."
-    doc = nlp(text)
-
-    for ent in doc.ents:
-        print("ent", ent.text, ent.label_, ent.kb_id_)
diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py
index eb8343722..c2156e31b 100644
--- a/examples/pipeline/wiki_entity_linking/run_el.py
+++ b/examples/pipeline/wiki_entity_linking/run_el.py
@@ -1,12 +1,113 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import os
 import spacy
+import datetime
+from os import listdir
+
+from examples.pipeline.wiki_entity_linking import training_set_creator
 
 # requires: pip install neuralcoref --no-binary neuralcoref
 # import neuralcoref
 
 
+def run_el_toy_example(nlp, kb):
+    _prepare_pipeline(nlp, kb)
+
+    candidates = kb.get_candidates("Bush")
+
+    print("generating candidates for 'Bush' :")
+    for c in candidates:
+        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
+    print()
+
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel. " \
+           "The main character in Doug's novel is the man Arthur Dent, " \
+           "but Douglas doesn't write about George Washington or Homer Simpson."
+    doc = nlp(text)
+
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
+
+
+def run_el_training(nlp, kb, training_dir, limit=None):
+    _prepare_pipeline(nlp, kb)
+
+    correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir,
+                                                                                 collect_correct=True,
+                                                                                 collect_incorrect=False)
+
+    predictions = list()
+    golds = list()
+
+    cnt = 0
+    for f in listdir(training_dir):
+        if not limit or cnt < limit:
+            if is_dev(f):
+                article_id = f.replace(".txt", "")
+                if cnt % 500 == 0:
+                    print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
+                cnt += 1
+                with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
+                    text = file.read()
+                    doc = nlp(text)
+                    for ent in doc.ents:
+                        if ent.label_ == "PERSON":  # TODO: expand to other types
+                            gold_entity = correct_entries_per_article[article_id].get(ent.text, None)
+                            # only evaluating gold entities we know, because the training data is not complete
+                            if gold_entity:
+                                predictions.append(ent.kb_id_)
+                                golds.append(gold_entity)
+
+    print("Processed", cnt, "dev articles")
+    print()
+    evaluate(predictions, golds)
+
+
+def is_dev(file_name):
+    return file_name.endswith("3.txt")
+
+
+def evaluate(predictions, golds):
+    if len(predictions) != len(golds):
+        raise ValueError("predictions and gold entities should have the same length")
+
+    print("Evaluating", len(golds), "entities")
+
+    tp = 0
+    fp = 0
+    fn = 0
+
+    for pred, gold in zip(predictions, golds):
+        is_correct = pred == gold
+        if not pred:
+            fn += 1
+        elif is_correct:
+            tp += 1
+        else:
+            fp += 1
+
+    print("tp", tp)
+    print("fp", fp)
+    print("fn", fn)
+
+    precision = tp / (tp + fp + 0.0000001)
+    recall = tp / (tp + fn + 0.0000001)
+    fscore = 2 * recall * precision / (recall + precision + 0.0000001)
+
+    print("precision", round(100 * precision, 1), "%")
+    print("recall", round(100 * recall, 1), "%")
+    print("Fscore", round(100 * fscore, 1), "%")
+
+
+def _prepare_pipeline(nlp, kb):
+    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
+
+
 # TODO
 def add_coref():
     """ Add coreference resolution to our model """
diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py
index e46aeec5b..47349d3dc 100644
--- a/examples/pipeline/wiki_entity_linking/training_set_creator.py
+++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py
@@ -12,6 +12,7 @@ from . import wikipedia_processor as wp
 Process Wikipedia interlinks to generate a training dataset for the EL algorithm
 """
 
+ENTITY_FILE = "gold_entities.csv"
 
 def create_training(kb, entity_input, training_output):
     if not kb:
@@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
 
     read_ids = set()
 
-    entityfile_loc = training_output + "/" + "gold_entities.csv"
+    entityfile_loc = training_output + "/" + ENTITY_FILE
     with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
         # write entity training header file
         _write_training_entity(outputfile=entityfile,
@@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output):
 
 def _write_training_entity(outputfile, article_id, alias, entity, correct):
     outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
+
+
+def read_training_entities(training_output, collect_correct=True, collect_incorrect=False):
+    entityfile_loc = training_output + "/" + ENTITY_FILE
+    incorrect_entries_per_article = dict()
+    correct_entries_per_article = dict()
+    with open(entityfile_loc, mode='r', encoding='utf8') as file:
+        for line in file:
+            fields = line.replace('\n', "").split(sep='|')
+            article_id = fields[0]
+            alias = fields[1]
+            entity = fields[2]
+            correct = fields[3]
+
+            if correct == "1" and collect_correct:
+                entry_dict = correct_entries_per_article.get(article_id, dict())
+                if alias in entry_dict:
+                    raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE)
+                entry_dict[alias] = entity
+                correct_entries_per_article[article_id] = entry_dict
+
+            if correct == "0" and collect_incorrect:
+                entry_dict = incorrect_entries_per_article.get(article_id, dict())
+                entities = entry_dict.get(alias, set())
+                entities.add(entity)
+                entry_dict[alias] = entities
+                incorrect_entries_per_article[article_id] = entry_dict
+
+    return correct_entries_per_article, incorrect_entries_per_article
+
+
+
+
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index 20d4f5953..ebc1e7958 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from . import wikipedia_processor as wp, kb_creator, training_set_creator
+from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
 
 import spacy
 from spacy.vocab import Vocab
@@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
 
-TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
-TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
+TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
 
 
 if __name__ == "__main__":
@@ -37,8 +36,12 @@ if __name__ == "__main__":
     to_read_kb = True
     to_test_kb = False
 
+    # create training dataset
     create_wp_training = False
 
+    # apply named entity linking to the training dataset
+    apply_to_training = True
+
     # STEP 1 : create prior probabilities from WP
     # run only once !
     if to_create_prior_probs:
@@ -88,13 +91,21 @@ if __name__ == "__main__":
 
     # test KB
     if to_test_kb:
-        kb_creator.test_kb(my_kb)
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
         print()
 
     # STEP 5: create a training dataset from WP
     if create_wp_training:
         print("STEP 5: create training dataset", datetime.datetime.now())
-        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)
+        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)
+
+    # STEP 6: apply the EL algorithm on the training dataset
+    if apply_to_training:
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
+        print()
+
 
     # TODO coreference resolution
     # add_coref()
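
Note on the evaluation added in run_el.py and the gold-entity format used by training_set_creator.py: each line of gold_entities.csv is pipe-delimited as article_id|alias|entity|correct, and evaluate() counts an empty prediction as a false negative, a KB id matching the gold entity as a true positive, and anything else as a false positive, with a small epsilon guarding against division by zero. The snippet below is a minimal standalone sketch of that behaviour, not part of the patch; the helper names parse_gold_line and toy_evaluate, and the sample ids and article numbers, are made up for illustration.

    def parse_gold_line(line):
        # same layout as read_training_entities(): article_id|alias|entity|correct
        article_id, alias, entity, correct = line.rstrip("\n").split("|")
        return article_id, alias, entity, correct == "1"

    def toy_evaluate(predictions, golds):
        # same counting scheme as evaluate() in run_el.py
        tp = fp = fn = 0
        for pred, gold in zip(predictions, golds):
            if not pred:
                fn += 1          # no KB id predicted at all
            elif pred == gold:
                tp += 1          # predicted id matches the gold entity
            else:
                fp += 1          # predicted a different entity
        eps = 0.0000001          # avoids division by zero, as in the patch
        precision = tp / (tp + fp + eps)
        recall = tp / (tp + fn + eps)
        fscore = 2 * precision * recall / (precision + recall + eps)
        return precision, recall, fscore

    # hypothetical gold line and a toy prediction set
    print(parse_gold_line("12|Bush|SomeEntityId|1"))   # ('12', 'Bush', 'SomeEntityId', True)
    print(toy_evaluate(["E1", "", "E2"], ["E1", "E1", "E3"]))   # one tp, one fn, one fp

Only entities that appear in the gold file are scored (see run_el_training), since the Wikipedia-derived training data is not exhaustive; entities the model links but that have no gold entry are simply skipped rather than counted as errors.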