Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 17:24:41 +03:00
baseline evaluation using highest-freq candidate
This commit is contained in:
parent 6961215578
commit 7e348d7f7f
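For context, the baseline named in the commit message links each mention to the candidate entity that is most frequent in the knowledge base, without looking at the sentence context. Below is a minimal sketch of that idea, using only candidate attributes that appear in this diff (get_candidates, entity_freq, entity_); the helper name is hypothetical and not part of this commit.

# Hypothetical helper (not part of this commit) illustrating the baseline:
# resolve a mention by picking the candidate that is most frequent in the KB,
# ignoring sentence context and prior probabilities.
def highest_freq_candidate(kb, alias):
    candidates = kb.get_candidates(alias)
    if not candidates:
        return None
    best = max(candidates, key=lambda c: c.entity_freq)
    return best.entity_
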
@@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
    if to_print:
        print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())


def test_kb(kb):
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    nlp = spacy.load('en_core_web_sm')

    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)

    candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for c in candidates:
        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)

    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)

@@ -1,12 +1,113 @@
# coding: utf-8
from __future__ import unicode_literals

import os
import spacy
import datetime
from os import listdir

from examples.pipeline.wiki_entity_linking import training_set_creator

# requires: pip install neuralcoref --no-binary neuralcoref
# import neuralcoref


def run_el_toy_example(nlp, kb):
    _prepare_pipeline(nlp, kb)

    candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for c in candidates:
        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)

    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)


def run_el_training(nlp, kb, training_dir, limit=None):
    _prepare_pipeline(nlp, kb)

    correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir,
                                                                                 collect_correct=True,
                                                                                 collect_incorrect=False)

    predictions = list()
    golds = list()

    cnt = 0
    for f in listdir(training_dir):
        if not limit or cnt < limit:
            if is_dev(f):
                article_id = f.replace(".txt", "")
                if cnt % 500 == 0:
                    print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
                cnt += 1
                with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                    text = file.read()
                    doc = nlp(text)
                    for ent in doc.ents:
                        if ent.label_ == "PERSON":  # TODO: expand to other types
                            gold_entity = correct_entries_per_article[article_id].get(ent.text, None)
                            # only evaluating gold entities we know, because the training data is not complete
                            if gold_entity:
                                predictions.append(ent.kb_id_)
                                golds.append(gold_entity)

    print("Processed", cnt, "dev articles")
    print()
    evaluate(predictions, golds)


def is_dev(file_name):
    return file_name.endswith("3.txt")


def evaluate(predictions, golds):
    if len(predictions) != len(golds):
        raise ValueError("predictions and gold entities should have the same length")

    print("Evaluating", len(golds), "entities")

    tp = 0
    fp = 0
    fn = 0

    for pred, gold in zip(predictions, golds):
        is_correct = pred == gold
        if not pred:
            fn += 1
        elif is_correct:
            tp += 1
        else:
            fp += 1

    print("tp", tp)
    print("fp", fp)
    print("fn", fn)

    precision = tp / (tp + fp + 0.0000001)
    recall = tp / (tp + fn + 0.0000001)
    fscore = 2 * recall * precision / (recall + precision + 0.0000001)

    print("precision", round(100 * precision, 1), "%")
    print("recall", round(100 * recall, 1), "%")
    print("Fscore", round(100 * fscore, 1), "%")


def _prepare_pipeline(nlp, kb):
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)


# TODO
def add_coref():
    """ Add coreference resolution to our model """

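A quick, hypothetical usage sketch of the evaluate() scoring above, with invented KB ids: an empty prediction counts as a false negative, and a non-empty but wrong prediction counts as a false positive.

# Invented ids, purely to illustrate the counting behaviour of evaluate() above.
from examples.pipeline.wiki_entity_linking import run_el

predictions = ["Q42", "", "Q100"]   # "" means the linker produced no KB id for the mention
golds = ["Q42", "Q7", "Q90"]

run_el.evaluate(predictions, golds)
# tp=1 (Q42 matches), fn=1 (empty prediction), fp=1 (Q100 != Q90)
# precision, recall and Fscore all print as 50.0 %
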
@@ -12,6 +12,7 @@ from . import wikipedia_processor as wp
Process Wikipedia interlinks to generate a training dataset for the EL algorithm
"""

ENTITY_FILE = "gold_entities.csv"

def create_training(kb, entity_input, training_output):
    if not kb:

@@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):

    read_ids = set()

    entityfile_loc = training_output + "/" + "gold_entities.csv"
    entityfile_loc = training_output + "/" + ENTITY_FILE
    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
        # write entity training header file
        _write_training_entity(outputfile=entityfile,

@@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output):

def _write_training_entity(outputfile, article_id, alias, entity, correct):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")


def read_training_entities(training_output, collect_correct=True, collect_incorrect=False):
    entityfile_loc = training_output + "/" + ENTITY_FILE
    incorrect_entries_per_article = dict()
    correct_entries_per_article = dict()
    with open(entityfile_loc, mode='r', encoding='utf8') as file:
        for line in file:
            fields = line.replace('\n', "").split(sep='|')
            article_id = fields[0]
            alias = fields[1]
            entity = fields[2]
            correct = fields[3]

            if correct == "1" and collect_correct:
                entry_dict = correct_entries_per_article.get(article_id, dict())
                if alias in entry_dict:
                    raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE)
                entry_dict[alias] = entity
                correct_entries_per_article[article_id] = entry_dict

            if correct == "0" and collect_incorrect:
                entry_dict = incorrect_entries_per_article.get(article_id, dict())
                entities = entry_dict.get(alias, set())
                entities.add(entity)
                entry_dict[alias] = entities
                incorrect_entries_per_article[article_id] = entry_dict

    return correct_entries_per_article, incorrect_entries_per_article

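The file parsed by read_training_entities() is the pipe-delimited gold_entities.csv written by _write_training_entity() above. A hypothetical round trip, with invented article ids, entity ids and directory:

# Illustrative only: file contents, ids and the directory are invented for the example.
from examples.pipeline.wiki_entity_linking import training_set_creator

# Each line has the shape written by _write_training_entity():
#   article_id|alias|entity|correct
# e.g.
#   111|Douglas Adams|Q42|1
#   111|Douglas Adams|Q123456|0

correct, incorrect = training_set_creator.read_training_entities(
    training_output="/tmp/training_nel",  # hypothetical directory containing gold_entities.csv
    collect_correct=True,
    collect_incorrect=True)
# correct   -> {"111": {"Douglas Adams": "Q42"}}
# incorrect -> {"111": {"Douglas Adams": {"Q123456"}}}
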
@@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

from . import wikipedia_processor as wp, kb_creator, training_set_creator
from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el

import spacy
from spacy.vocab import Vocab

@@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'

TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'


if __name__ == "__main__":

@@ -37,8 +36,12 @@ if __name__ == "__main__":
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # apply named entity linking to the training dataset
    apply_to_training = True

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:

@@ -88,13 +91,21 @@ if __name__ == "__main__":

    # test KB
    if to_test_kb:
        kb_creator.test_kb(my_kb)
        my_nlp = spacy.load('en_core_web_sm')
        run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp)
        print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)
        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)

    # STEP 6: apply the EL algorithm on the training dataset
    if apply_to_training:
        my_nlp = spacy.load('en_core_web_sm')
        run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
        print()


    # TODO coreference resolution
    # add_coref()