# spaCy/examples/pipeline/wikidata_entity_linking.py

# coding: utf-8
from __future__ import unicode_literals

import random
import datetime
from pathlib import Path

from bin.wiki_entity_linking import wikipedia_processor as wp
from bin.wiki_entity_linking import training_set_creator, kb_creator
from bin.wiki_entity_linking.kb_creator import DESC_WIDTH

import spacy
from spacy.kb import KnowledgeBase
from spacy.util import minibatch, compounding

"""
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""

# Root folder holding the raw dumps and everything this script generates.
# NOTE(review): hard-coded absolute path to one developer's machine - adapt before running.
ROOT_DIR = Path("C:/Users/Sofie/Documents/data/")
# Base folder for the intermediate CSV files, the KB and the serialized pipelines.
OUTPUT_DIR = ROOT_DIR / "wikipedia"
# Folder handed to training_set_creator (as training_output in STEP 5, training_dir in STEP 6).
TRAINING_DIR = OUTPUT_DIR / "training_data_nel"

# Intermediate files exchanged between the KB construction steps:
# - PRIOR_PROB: second argument of wp.read_prior_probs (STEP 1), input of STEP 2 and STEP 3
# - ENTITY_COUNTS: second argument of wp.write_entity_counts (STEP 2), count_input of STEP 3
# - ENTITY_DEFS: entity_def_output of STEP 3, entity_def_input of STEP 5
# - ENTITY_DESCR: entity_descr_output of STEP 3
PRIOR_PROB = OUTPUT_DIR / "prior_prob.csv"
ENTITY_COUNTS = OUTPUT_DIR / "entity_freq.csv"
ENTITY_DEFS = OUTPUT_DIR / "entity_defs.csv"
ENTITY_DESCR = OUTPUT_DIR / "entity_descriptions.csv"

# KB_FILE is where the knowledge base is dumped (STEP 3b) and loaded from (STEP 4).
# NLP_1_DIR stores the nlp object saved next to the KB; NLP_2_DIR stores the pipeline
# including the trained entity linker (STEP 9).
KB_FILE = OUTPUT_DIR / "kb_1" / "kb"
NLP_1_DIR = OUTPUT_DIR / "nlp_1"
NLP_2_DIR = OUTPUT_DIR / "nlp_2"

# WikiData dump: get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
WIKIDATA_JSON = ROOT_DIR / "wikidata" / "wikidata-20190304-all.json.bz2"

# Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
# from https://dumps.wikimedia.org/enwiki/latest/
ENWIKI_DUMP = (
    ROOT_DIR / "wikipedia" / "enwiki-20190320-pages-articles-multistream.xml.bz2"
)

# KB construction parameters, passed to kb_creator.create_kb as
# max_entities_per_alias, min_entity_freq and min_occ respectively
MAX_CANDIDATES = 10
MIN_ENTITY_FREQ = 20
MIN_PAIR_OCC = 5

# model training parameters
EPOCHS = 10  # number of passes over the training data
DROPOUT = 0.5  # passed as `drop` to nlp.update
LEARN_RATE = 0.005  # assigned to optimizer.learn_rate
L2 = 1e-6  # assigned to optimizer.L2
CONTEXT_WIDTH = 128  # "context_width" entry of the entity_linker config


def now():
    """Return the current local date and time (used to timestamp the log output)."""
    clock = datetime.datetime
    return clock.now()


def run_pipeline():
    """Run the WikiData entity linking demo from start to finish.

    Which steps execute is controlled by the boolean flags defined at the top of
    this function; edit them to (re)run individual parts:

    1. compute prior probabilities from the Wikipedia dump
    2. derive entity frequencies from the prior probabilities
    3. create the knowledge base and write it (plus the nlp object) to disk
    4. read the knowledge base back in from disk
    5. create the training dataset from Wikipedia
    6. create and train the entity linking pipe
    7. measure its accuracy on the dev set against random / prior / oracle baselines
    8. apply the pipeline to a toy example
    9. write the pipeline to disk, and optionally reload and re-test it

    Takes no arguments and returns nothing; progress and results are printed.
    """
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", now())
    print()
    nlp_1 = spacy.load("en_core_web_lg")
    # nlp_2 and kb_2 are only filled in by STEP 4
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1 : create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", now())
        wp.read_prior_probs(ENWIKI_DUMP, PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", now())
        wp.write_entity_counts(PRIOR_PROB, ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", now())
        kb_1 = kb_creator.create_kb(
            nlp=nlp_1,
            max_entities_per_alias=MAX_CANDIDATES,
            min_entity_freq=MIN_ENTITY_FREQ,
            min_occ=MIN_PAIR_OCC,
            entity_def_output=ENTITY_DEFS,
            entity_descr_output=ENTITY_DESCR,
            count_input=ENTITY_COUNTS,
            prior_prob_input=PRIOR_PROB,
            wikidata_input=WIKIDATA_JSON,
        )
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", now())
        # the nlp object is reloaded from the directory written in STEP 3b and its
        # vocab is used to construct the KB before loading the dumped KB file
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", now())
        training_set_creator.create_training(
            wikipedia_input=ENWIKI_DUMP,
            entity_def_input=ENTITY_DEFS,
            training_output=TRAINING_DIR,
        )

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", now())
        # NOTE(review): nlp_2 and kb_2 are only assigned in STEP 4; with to_read_kb=False
        # they are still None here and the next line fails with an AttributeError.
        # map each label of the entity recognizer to an integer index
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(
            name="entity_linker",
            config={
                "context_width": CONTEXT_WIDTH,
                "pretrained_vectors": nlp_2.vocab.vectors.name,
                "type_to_int": type_to_int,
            },
        )
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        # the optimizer is created while every pipe except the entity linker is disabled
        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        # for training, get pos & neg instances that correspond to entries in the kb
        train_data = training_set_creator.read_training(
            nlp=nlp_2,
            training_dir=TRAINING_DIR,
            dev=False,
            limit=train_limit,
            kb=el_pipe.kb,
        )

        print("Training on", len(train_data), "articles")
        print()

        # for testing, get all pos instances, whether or not they are in the kb
        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, kb=None
        )

        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                # every epoch: reshuffle, reset the loss bookkeeping and re-batch the data
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs=docs,
                                golds=golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            # a failing batch is reported and skipped; it is not counted in batchnr
                            print("Error updating batch:", e)

                if batchnr > 0:
                    # evaluate on the dev set with both context and prior switched on,
                    # and report the loss averaged over the successfully updated batches
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, _ = _measure_acc(dev_data, el_pipe)
                    losses["entity_linker"] = losses["entity_linker"] / batchnr
                    print(
                        "Epoch, train loss",
                        itn,
                        round(losses["entity_linker"], 2),
                        " / dev acc avg",
                        round(dev_acc_context, 3),
                    )

        # Steps 7-9 are nested inside `if train_pipe` because they rely on el_pipe
        # and dev_data, which are only defined in STEP 6.

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", now())
            print()

            # baselines: r = random, p = prior probability, o = oracle;
            # the *_d variants hold the accuracy per entity label
            counts, acc_r, acc_r_d, acc_p, acc_p_d, acc_o, acc_o_d = _measure_baselines(
                dev_data, kb_2
            )
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))

            oracle_by_label = [(x, round(y, 3)) for x, y in acc_o_d.items()]
            print("dev acc oracle:", round(acc_o, 3), oracle_by_label)

            random_by_label = [(x, round(y, 3)) for x, y in acc_r_d.items()]
            print("dev acc random:", round(acc_r, 3), random_by_label)

            prior_by_label = [(x, round(y, 3)) for x, y in acc_p_d.items()]
            print("dev acc prior:", round(acc_p, 3), prior_by_label)

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_cont_d = _measure_acc(dev_data, el_pipe)
            context_by_label = [(x, round(y, 3)) for x, y in dev_acc_cont_d.items()]
            print("dev acc context avg:", round(dev_acc_context, 3), context_by_label)

            # measuring combined accuracy (prior + context)
            # (this also leaves both weights at 1 for the steps that follow)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_d = _measure_acc(dev_data, el_pipe)
            combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_d.items()]
            print("dev acc combo avg:", round(dev_acc_combo, 3), combo_by_label)

        # STEP 8: apply the EL pipe on a toy example
        if to_test_pipeline:
            print()
            print("STEP 8: applying Entity Linking to toy example", now())
            print()
            run_el_toy_example(nlp=nlp_2)

        # STEP 9: write the NLP pipeline (including entity linker) to file
        if to_write_nlp:
            print()
            print("STEP 9: testing NLP IO", now())
            print()
            print("writing to", NLP_2_DIR)
            nlp_2.to_disk(NLP_2_DIR)
            print()

    # verify that the IO has gone correctly
    # NOTE(review): this block is independent of train_pipe / to_write_nlp; if STEP 9 did
    # not run, it relies on NLP_2_DIR having been written by an earlier run.
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)

        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        # nlp_2 is reloaded from NLP_1_DIR and used to read the dev data;
        # the trained entity_linker is taken from the pipeline stored in NLP_2_DIR
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, kb=None
        )

        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_acc(dev_data, el_pipe)
        combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]
        print("dev acc combo avg:", round(dev_acc_combo, 3), combo_by_label)

    print()
    print("STOP", now())


def _measure_acc(data, el_pipe=None, error_analysis=False):
    """Measure how often the KB id predicted for an entity matches the gold link.

    data: sequence of (doc, gold) pairs; pairs whose doc has length 0 are ignored.
    el_pipe: if given, the docs are first run through `el_pipe.pipe` so that the
        entities carry fresh predictions.
    error_analysis: if True, print every wrong prediction with its gold id.
    RETURNS: the (overall accuracy, accuracy per entity label) pair produced by
        `calculate_acc`.

    Only entities with a positive gold link at exactly the same character
    offsets are scored. A doc that raises during scoring is reported and skipped.
    """
    hits_by_label = dict()
    misses_by_label = dict()

    docs = [doc for doc, _ in data if len(doc) > 0]
    if el_pipe is not None:
        docs = list(el_pipe.pipe(docs))
    golds = [gold for doc, gold in data if len(doc) > 0]

    for doc, gold in zip(docs, golds):
        try:
            # "start-end" character offsets -> gold KB id, positive examples only
            gold_by_offset = dict()
            for link, is_positive in gold.links.items():
                if not is_positive:
                    continue
                link_start, link_end, gold_id = link
                gold_by_offset["-".join((str(link_start), str(link_end)))] = gold_id

            for ent in doc.ents:
                label = ent.label_
                predicted = ent.kb_id_
                span_key = "-".join((str(ent.start_char), str(ent.end_char)))
                expected = gold_by_offset.get(span_key, None)

                # the gold annotations are incomplete, so an entity without a
                # gold link can't be counted as 'wrong'
                if expected is None:
                    continue

                if expected == predicted:
                    hits_by_label[label] = hits_by_label.get(label, 0) + 1
                    continue

                misses_by_label[label] = misses_by_label.get(label, 0) + 1
                if error_analysis:
                    print(ent.text, "in", doc)
                    print("Predicted", predicted, "should have been", expected)
                    print()

        except Exception as e:
            print("Error assessing accuracy", e)

    acc, acc_by_label = calculate_acc(hits_by_label, misses_by_label)
    return acc, acc_by_label


def _measure_baselines(data, kb):
    """Measure three baselines for entity linking on gold-annotated docs.

    For every entity with a positive gold link at the same character offsets,
    the candidates returned by `kb.get_candidates(ent.text)` are used to make:
    - a random pick among the candidates,
    - the pick with the highest prior probability,
    - the 'oracle' pick, i.e. the gold entity whenever it is among the
      candidates (an upper bound for any candidate-based method).
    Without candidates all three picks are the empty string.

    data: sequence of (doc, gold) pairs; pairs whose doc has length 0 are ignored.
    kb: object providing `get_candidates`.
    RETURNS: tuple (counts per label, random acc, random acc per label,
        prior acc, prior acc per label, oracle acc, oracle acc per label).
    """

    def _bump(tally, key):
        # increase the counter stored under `key`, starting from 0
        tally[key] = tally.get(key, 0) + 1

    counts_d = dict()
    random_hits, random_misses = dict(), dict()
    oracle_hits, oracle_misses = dict(), dict()
    prior_hits, prior_misses = dict(), dict()

    docs = [doc for doc, _ in data if len(doc) > 0]
    golds = [gold for doc, gold in data if len(doc) > 0]

    for doc, gold in zip(docs, golds):
        try:
            # "start-end" character offsets -> gold KB id, positive examples only
            gold_by_offset = dict()
            for link, is_positive in gold.links.items():
                link_start, link_end, gold_id = link
                if is_positive:
                    offset_key = "-".join((str(link_start), str(link_end)))
                    gold_by_offset[offset_key] = gold_id

            for ent in doc.ents:
                label = ent.label_
                span_key = "-".join((str(ent.start_char), str(ent.end_char)))
                expected = gold_by_offset.get(span_key, None)

                # the gold annotations are incomplete, so an entity without a
                # gold link can't be counted as 'wrong'
                if expected is None:
                    continue

                _bump(counts_d, label)
                candidates = kb.get_candidates(ent.text)
                oracle_pick = prior_pick = random_pick = ""
                if candidates:
                    priors = []
                    for cand in candidates:
                        priors.append(cand.prior_prob)
                        if cand.entity_ == expected:
                            oracle_pick = cand.entity_

                    # first candidate carrying the highest prior probability
                    prior_pick = candidates[priors.index(max(priors))].entity_
                    random_pick = random.choice(candidates).entity_

                _bump(prior_hits if expected == prior_pick else prior_misses, label)
                _bump(random_hits if expected == random_pick else random_misses, label)
                _bump(oracle_hits if expected == oracle_pick else oracle_misses, label)

        except Exception as e:
            print("Error assessing accuracy", e)

    acc_prior, acc_prior_d = calculate_acc(prior_hits, prior_misses)
    acc_rand, acc_rand_d = calculate_acc(random_hits, random_misses)
    acc_oracle, acc_oracle_d = calculate_acc(oracle_hits, oracle_misses)

    return (
        counts_d,
        acc_rand,
        acc_rand_d,
        acc_prior,
        acc_prior_d,
        acc_oracle,
        acc_oracle_d,
    )


def calculate_acc(correct_by_label, incorrect_by_label):
    """Turn per-label correct / incorrect counts into accuracies.

    correct_by_label: dict mapping a label to its number of correct predictions.
    incorrect_by_label: dict mapping a label to its number of incorrect predictions.
    RETURNS: tuple (acc, acc_by_label). `acc` is the accuracy over all labels
        together; `acc_by_label` maps every label found in either input to its
        own accuracy, with the labels inserted in sorted order. A label (or the
        overall total) without any counted prediction gets accuracy 0.
    """
    acc_by_label = dict()
    total_correct = 0
    total_incorrect = 0
    for label in sorted(set(correct_by_label) | set(incorrect_by_label)):
        correct = correct_by_label.get(label, 0)
        incorrect = incorrect_by_label.get(label, 0)
        total_correct += correct
        total_incorrect += incorrect
        seen = correct + incorrect
        # float() forces true division: this module does not import `division`
        # from __future__, so int / int would truncate to 0 or 1 on Python 2
        acc_by_label[label] = correct / float(seen) if seen else 0
    total = total_correct + total_incorrect
    acc = total_correct / float(total) if total else 0
    return acc, acc_by_label


def check_kb(kb):
    """Print the candidates the knowledge base generates for a few sample mentions.

    kb: object providing `get_candidates(mention)`; for each returned candidate
        its prior probability, alias, entity and entity frequency are printed.
    """
    sample_mentions = ("Bush", "Douglas Adams", "Homer", "Brazil", "China")
    for mention in sample_mentions:
        found = kb.get_candidates(mention)

        print("generating candidates for " + mention + " :")
        for cand in found:
            prior = cand.prior_prob
            alias = cand.alias_
            entity_with_freq = cand.entity_ + " (freq=" + str(cand.entity_freq) + ")"
            print(" ", prior, alias, "-->", entity_with_freq)
        print()


def run_el_toy_example(nlp):
    """Run `nlp` on a fixed example text and print the entities it finds.

    nlp: callable that takes the text and returns an object with `ents`; for
        every entity its text, label and KB id are printed after the text itself.
    """
    text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel, even in China or Brazil. "
        "The main character in Doug's novel is the man Arthur Dent, "
        "but Douglas doesn't write about George Washington or Homer Simpson."
    )
    parsed = nlp(text)
    print(text)
    for mention in parsed.ents:
        print(" ent", mention.text, mention.label_, mention.kb_id_)
    print()


# Run the demo pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run_pipeline()