# coding: utf-8 from __future__ import unicode_literals import os import spacy import datetime from os import listdir from examples.pipeline.wiki_entity_linking import training_set_creator # requires: pip install neuralcoref --no-binary neuralcoref # import neuralcoref def run_kb_toy_example(kb): for mention in ("Bush", "Douglas Adams", "Homer"): candidates = kb.get_candidates(mention) print("generating candidates for " + mention + " :") for c in candidates: print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") print() def run_el_dev(nlp, kb, training_dir, limit=None): correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=False) predictions = list() golds = list() cnt = 0 for f in listdir(training_dir): if not limit or cnt < limit: if is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0: print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") cnt += 1 with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() doc = nlp(text) for ent in doc.ents: if ent.label_ == "PERSON": # TODO: expand to other types gold_entity = correct_entries_per_article[article_id].get(ent.text, None) # only evaluating gold entities we know, because the training data is not complete if gold_entity: predictions.append(ent.kb_id_) golds.append(gold_entity) print("Processed", cnt, "dev articles") print() evaluate(predictions, golds) def is_dev(file_name): return file_name.endswith("3.txt") def evaluate(predictions, golds, to_print=True, times_hundred=True): if len(predictions) != len(golds): raise ValueError("predictions and gold entities should have the same length") tp = 0 fp = 0 fn = 0 corrects = 0 incorrects = 0 for pred, gold in zip(predictions, golds): is_correct = pred == gold if is_correct: corrects += 1 else: incorrects += 1 if not pred: if not is_correct: # we don't care about tn fn += 1 elif is_correct: tp += 1 else: fp += 1 if to_print: print("Evaluating", len(golds), "entities") print("tp", tp) print("fp", fp) print("fn", fn) precision = tp / (tp + fp + 0.0000001) recall = tp / (tp + fn + 0.0000001) if times_hundred: precision = precision*100 recall = recall*100 fscore = 2 * recall * precision / (recall + precision + 0.0000001) accuracy = corrects / (corrects + incorrects) if to_print: print("precision", round(precision, 1), "%") print("recall", round(recall, 1), "%") print("Fscore", round(fscore, 1), "%") print("Accuracy", round(accuracy, 1), "%") return precision, recall, fscore, accuracy # TODO def add_coref(nlp): """ Add coreference resolution to our model """ # TODO: this doesn't work yet # neuralcoref.add_to_pipe(nlp) print("done adding to pipe") doc = nlp(u'My sister has a dog. She loves him.') print("done doc") print(doc._.has_coref) print(doc._.coref_clusters) # TODO def _run_ner_depr(nlp, clean_text, article_dict): doc = nlp(clean_text) for ent in doc.ents: if ent.label_ == "PERSON": # TODO: expand to non-persons ent_id = article_dict.get(ent.text) if ent_id: print(" -", ent.text, ent.label_, ent_id) else: print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases