Mirror of https://github.com/explosion/spaCy.git
baseline evaluation using highest-freq candidate
This commit is contained in:
parent 6961215578
commit 7e348d7f7f
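Note on the commit title: the diff below only evaluates whatever kb_id_ the entity_linker pipe has assigned; the selection of the highest-frequency candidate itself is not shown in these hunks. As a rough, hypothetical sketch (not part of this commit), such a baseline could be expressed with the kb.get_candidates() call and the entity_freq / entity_ candidate attributes that do appear in the example code:

def highest_freq_candidate(kb, alias):
    # Hypothetical helper (not in this commit): return the KB id of the
    # candidate entity that occurs most frequently for the given alias.
    candidates = kb.get_candidates(alias)
    if not candidates:
        return None
    return max(candidates, key=lambda c: c.entity_freq).entity_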
@@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
     if to_print:
         print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
 
-
-def test_kb(kb):
-    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
-    nlp = spacy.load('en_core_web_sm')
-
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
-
-    candidates = kb.get_candidates("Bush")
-
-    print("generating candidates for 'Bush' :")
-    for c in candidates:
-        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
-    print()
-
-    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
-           "Douglas reminds us to always bring our towel. " \
-           "The main character in Doug's novel is the man Arthur Dent, " \
-           "but Douglas doesn't write about George Washington or Homer Simpson."
-    doc = nlp(text)
-
-    for ent in doc.ents:
-        print("ent", ent.text, ent.label_, ent.kb_id_)
@@ -1,12 +1,113 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import os
 import spacy
+import datetime
+from os import listdir
+
+from examples.pipeline.wiki_entity_linking import training_set_creator
 
 # requires: pip install neuralcoref --no-binary neuralcoref
 # import neuralcoref
 
 
+def run_el_toy_example(nlp, kb):
+    _prepare_pipeline(nlp, kb)
+
+    candidates = kb.get_candidates("Bush")
+
+    print("generating candidates for 'Bush' :")
+    for c in candidates:
+        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
+    print()
+
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel. " \
+           "The main character in Doug's novel is the man Arthur Dent, " \
+           "but Douglas doesn't write about George Washington or Homer Simpson."
+    doc = nlp(text)
+
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
+
+
+def run_el_training(nlp, kb, training_dir, limit=None):
+    _prepare_pipeline(nlp, kb)
+
+    correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir,
+                                                                                 collect_correct=True,
+                                                                                 collect_incorrect=False)
+
+    predictions = list()
+    golds = list()
+
+    cnt = 0
+    for f in listdir(training_dir):
+        if not limit or cnt < limit:
+            if is_dev(f):
+                article_id = f.replace(".txt", "")
+                if cnt % 500 == 0:
+                    print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
+                cnt += 1
+                with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
+                    text = file.read()
+                    doc = nlp(text)
+                    for ent in doc.ents:
+                        if ent.label_ == "PERSON":  # TODO: expand to other types
+                            gold_entity = correct_entries_per_article[article_id].get(ent.text, None)
+                            # only evaluating gold entities we know, because the training data is not complete
+                            if gold_entity:
+                                predictions.append(ent.kb_id_)
+                                golds.append(gold_entity)
+
+    print("Processed", cnt, "dev articles")
+    print()
+    evaluate(predictions, golds)
+
+
+def is_dev(file_name):
+    return file_name.endswith("3.txt")
+
+
+def evaluate(predictions, golds):
+    if len(predictions) != len(golds):
+        raise ValueError("predictions and gold entities should have the same length")
+
+    print("Evaluating", len(golds), "entities")
+
+    tp = 0
+    fp = 0
+    fn = 0
+
+    for pred, gold in zip(predictions, golds):
+        is_correct = pred == gold
+        if not pred:
+            fn += 1
+        elif is_correct:
+            tp += 1
+        else:
+            fp += 1
+
+    print("tp", tp)
+    print("fp", fp)
+    print("fn", fn)
+
+    precision = tp / (tp + fp + 0.0000001)
+    recall = tp / (tp + fn + 0.0000001)
+    fscore = 2 * recall * precision / (recall + precision + 0.0000001)
+
+    print("precision", round(100 * precision, 1), "%")
+    print("recall", round(100 * recall, 1), "%")
+    print("Fscore", round(100 * fscore, 1), "%")
+
+
+def _prepare_pipeline(nlp, kb):
+    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
+
+
 # TODO
 def add_coref():
     """ Add coreference resolution to our model """
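A quick illustration of the counting scheme in the new evaluate() helper, where an empty prediction counts as a false negative and a wrong id as a false positive; this is a self-contained sketch with made-up KB ids, not code from the commit:

predictions = ["Q42", "", "Q123"]   # hypothetical ids produced by the entity linker
golds = ["Q42", "Q7", "Q999"]       # hypothetical gold ids from the training data

tp = fp = fn = 0
for pred, gold in zip(predictions, golds):
    if not pred:
        fn += 1          # nothing predicted
    elif pred == gold:
        tp += 1          # correct link
    else:
        fp += 1          # wrong link

# tp=1, fp=1, fn=1 -> precision = recall = F-score = 50%
print(tp, fp, fn, tp / (tp + fp), tp / (tp + fn))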
@@ -12,6 +12,7 @@ from . import wikipedia_processor as wp
 Process Wikipedia interlinks to generate a training dataset for the EL algorithm
 """
 
+ENTITY_FILE = "gold_entities.csv"
 
 def create_training(kb, entity_input, training_output):
     if not kb:
@@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
 
     read_ids = set()
 
-    entityfile_loc = training_output + "/" + "gold_entities.csv"
+    entityfile_loc = training_output + "/" + ENTITY_FILE
     with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
         # write entity training header file
         _write_training_entity(outputfile=entityfile,
@@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output):
 
 def _write_training_entity(outputfile, article_id, alias, entity, correct):
     outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
+
+
+def read_training_entities(training_output, collect_correct=True, collect_incorrect=False):
+    entityfile_loc = training_output + "/" + ENTITY_FILE
+    incorrect_entries_per_article = dict()
+    correct_entries_per_article = dict()
+
+    with open(entityfile_loc, mode='r', encoding='utf8') as file:
+        for line in file:
+            fields = line.replace('\n', "").split(sep='|')
+            article_id = fields[0]
+            alias = fields[1]
+            entity = fields[2]
+            correct = fields[3]
+
+            if correct == "1" and collect_correct:
+                entry_dict = correct_entries_per_article.get(article_id, dict())
+                if alias in entry_dict:
+                    raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE)
+                entry_dict[alias] = entity
+                correct_entries_per_article[article_id] = entry_dict
+
+            if correct == "0" and collect_incorrect:
+                entry_dict = incorrect_entries_per_article.get(article_id, dict())
+                entities = entry_dict.get(alias, set())
+                entities.add(entity)
+                entry_dict[alias] = entities
+                incorrect_entries_per_article[article_id] = entry_dict
+
+    return correct_entries_per_article, incorrect_entries_per_article
+
+
+
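For orientation, read_training_entities() above parses the pipe-separated article_id|alias|entity|correct records written by _write_training_entity(). A minimal standalone sketch of that layout and the parsing, with made-up article and entity ids (not code from the commit):

import io

# two hypothetical records: one correct link (flag "1"), one incorrect (flag "0")
sample = "12345|Douglas Adams|Douglas_Adams|1\n" \
         "12345|Douglas Adams|Douglas_Adams_(disambiguation)|0\n"

correct, incorrect = dict(), dict()
for line in io.StringIO(sample):
    article_id, alias, entity, flag = line.strip().split("|")
    if flag == "1":
        correct.setdefault(article_id, dict())[alias] = entity
    else:
        incorrect.setdefault(article_id, dict()).setdefault(alias, set()).add(entity)

print(correct)    # {'12345': {'Douglas Adams': 'Douglas_Adams'}}
print(incorrect)  # {'12345': {'Douglas Adams': {'Douglas_Adams_(disambiguation)'}}}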
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from . import wikipedia_processor as wp, kb_creator, training_set_creator
+from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
 
 import spacy
 from spacy.vocab import Vocab
@@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
 
-TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
-TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
+TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
 
 
 if __name__ == "__main__":
@@ -37,8 +36,12 @@ if __name__ == "__main__":
     to_read_kb = True
     to_test_kb = False
 
+    # create training dataset
     create_wp_training = False
 
+    # apply named entity linking to the training dataset
+    apply_to_training = True
+
     # STEP 1 : create prior probabilities from WP
     # run only once !
     if to_create_prior_probs:
@@ -88,13 +91,21 @@ if __name__ == "__main__":
 
     # test KB
     if to_test_kb:
-        kb_creator.test_kb(my_kb)
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
         print()
 
     # STEP 5: create a training dataset from WP
     if create_wp_training:
         print("STEP 5: create training dataset", datetime.datetime.now())
-        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)
+        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)
+
+    # STEP 6: apply the EL algorithm on the training dataset
+    if apply_to_training:
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
+        print()
+
 
     # TODO coreference resolution
     # add_coref()