spaCy/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py

# coding: utf-8
from __future__ import unicode_literals

import random

from spacy.util import minibatch, compounding

from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH

import spacy
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase
import datetime

"""
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""

# CSV files produced / consumed by the one-time preparation steps of run_pipeline
# (prior probabilities, entity counts, and the entity definitions / descriptions
# written while creating the KB).
PRIOR_PROB = "C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv"
ENTITY_COUNTS = "C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv"
ENTITY_DEFS = "C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv"
ENTITY_DESCR = "C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv"

# Serialized knowledge base, and the directories the NLP objects are written to:
# NLP_1_DIR right after KB creation, NLP_2_DIR at the end of the pipeline.
KB_FILE = "C:/Users/Sofie/Documents/data/wikipedia/kb"
NLP_1_DIR = "C:/Users/Sofie/Documents/data/wikipedia/nlp_1"
NLP_2_DIR = "C:/Users/Sofie/Documents/data/wikipedia/nlp_2"

# Directory the entity linking training data is written to and read from
TRAINING_DIR = "C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/"

# KB creation: passed as max_entities_per_alias and min_occ
MAX_CANDIDATES = 10
MIN_PAIR_OCC = 5

# Entity linker: "doc_cutoff" config value, number of training epochs, dropout rate
DOC_CHAR_CUTOFF = 300
EPOCHS = 2
DROPOUT = 0.1


def run_pipeline():
    """Run the knowledge base / entity linking demo from start to finish.

    The boolean flags at the top of the function select which steps run:
    creating the prior probabilities and entity counts (steps 1-2), building
    and dumping the KB (step 3), reading it back (step 4), creating training
    data (step 5), training the entity linker (step 6), measuring accuracy
    (step 7), running the toy example (step 8) and round-tripping the NLP
    object through disk (step 9).

    ``nlp_2`` and ``kb_2`` are only assigned in step 4; the later steps use
    them, so they require ``to_read_kb`` to be enabled.
    """
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_1 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = True

    # read KB back in from file
    to_read_kb = True
    to_test_kb = True

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = False

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    test_nlp_io = True

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                     max_entities_per_alias=MAX_CANDIDATES,
                                     min_occ=MIN_PAIR_OCC,
                                     entity_def_output=ENTITY_DEFS,
                                     entity_descr_output=ENTITY_DESCR,
                                     count_input=ENTITY_COUNTS,
                                     prior_prob_input=PRIOR_PROB,
                                     to_print=False)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        # the KB is built on the vocab of the NLP object loaded from disk,
        # so both share the same vocab from here on
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            run_el.run_kb_toy_example(kb=kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)

    # STEP 6: create the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        train_limit = 10
        dev_limit = 5
        print("Training on", train_limit, "articles")
        print("Dev testing on", dev_limit, "articles")
        print()

        train_data = training_set_creator.read_training(nlp=nlp_2,
                                                        training_dir=TRAINING_DIR,
                                                        dev=False,
                                                        limit=train_limit,
                                                        to_print=False)

        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit,
                                                      to_print=False)

        el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF})
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            nlp_2.begin_training()

        for itn in range(EPOCHS):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
            batchnr = 0  # counts only the batches that were updated successfully

            with nlp_2.disable_pipes(*other_pipes):
                for batch in batches:
                    try:
                        docs, golds = zip(*batch)
                        nlp_2.update(
                            docs,
                            golds,
                            drop=DROPOUT,
                            losses=losses,
                        )
                        batchnr += 1
                    except Exception as e:
                        # best effort: report the failing batch and keep training
                        print("Error updating batch", e)

            if batchnr > 0:
                # report the average loss per successfully updated batch
                losses['entity_linker'] = losses['entity_linker'] / batchnr
                print("Epoch, train loss", itn, round(losses['entity_linker'], 2))
            else:
                # no training data, or every batch failed: there is no loss to
                # average, and dividing by zero would abort the whole run
                print("Epoch, no batch could be updated", itn)

        if measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
            print()

            # combination of prior probabilities and context
            el_pipe.context_weight = 1
            el_pipe.prior_weight = 1
            dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe)
            train_acc_1_1 = _measure_accuracy(train_data, el_pipe)
            print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2))

            # baseline using only prior probabilities
            el_pipe.context_weight = 0
            el_pipe.prior_weight = 1
            dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe)
            train_acc_0_1 = _measure_accuracy(train_data, el_pipe)
            print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2))

            # using only context
            el_pipe.context_weight = 1
            el_pipe.prior_weight = 0
            dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe)
            train_acc_1_0 = _measure_accuracy(train_data, el_pipe)

            print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2))
            print()

    if to_test_pipeline:
        print()
        print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())
        print()
        run_el_toy_example(nlp=nlp_2)
        print()

    if test_nlp_io:
        print()
        print("STEP 9: testing NLP IO", datetime.datetime.now())
        print()
        print("writing to", NLP_2_DIR)
        print(" vocab len nlp_2", len(nlp_2.vocab))
        print(" vocab len kb_2", len(kb_2.vocab))
        nlp_2.to_disk(NLP_2_DIR)
        print()
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        print(" vocab len nlp_3", len(nlp_3.vocab))

        # the KB travels with the entity linker pipe; report its vocab size after reloading
        for pipe_name, pipe in nlp_3.pipeline:
            if pipe_name == "entity_linker":
                print(" vocab len kb_3", len(pipe.kb.vocab))

        print()
        print("running toy example with NLP 2")
        run_el_toy_example(nlp=nlp_3)

    print()
    print("STOP", datetime.datetime.now())


def _measure_accuracy(data, el_pipe):
    """Return the fraction of gold-annotated PERSON mentions that were linked correctly.

    data: iterable of (doc, gold) pairs. Pairs whose doc has length 0 are
        skipped. Each gold provides ``links`` as (start, end, kb_id) triples.
    el_pipe: object whose ``pipe(docs)`` yields the docs to be scored; the
        predictions are read from ``ent.kb_id_`` of each entity in ``doc.ents``.

    RETURNS: 0 if no entity could be scored, otherwise
        correct / (correct + incorrect) as a float.
    """
    correct = 0
    incorrect = 0

    # Filter in a single pass so docs and golds stay aligned, also when
    # `data` is an iterator that can only be consumed once.
    scored_pairs = [(doc, gold) for doc, gold in data if len(doc) > 0]
    golds = [gold for _, gold in scored_pairs]
    docs = el_pipe.pipe([doc for doc, _ in scored_pairs])

    for doc, gold in zip(docs, golds):
        try:
            # gold KB identifier per annotated span, keyed by "start-end"
            correct_entries_per_article = dict()
            for start, end, gold_kb in gold.links:
                correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb

            for ent in doc.ents:
                if ent.label_ != "PERSON":  # TODO: expand to other types
                    continue
                # NOTE(review): the lookup uses ent.start / ent.end, so the
                # offsets in gold.links must be in the same unit - confirm
                # against the code that creates the training data.
                span_key = str(ent.start) + "-" + str(ent.end)
                gold_entity = correct_entries_per_article.get(span_key, None)
                if gold_entity is None:
                    # predicted entities without a gold annotation are not scored
                    continue
                if gold_entity == ent.kb_id_:
                    correct += 1
                else:
                    incorrect += 1

        except Exception as e:
            # best effort: report the problematic article and keep scoring the rest
            print("Error assessing accuracy", e)

    total = correct + incorrect
    if total == 0:
        return 0

    # float() forces true division; with plain "/" on two ints, Python 2
    # would truncate the accuracy to 0 or 1.
    return correct / float(total)


def run_el_toy_example(nlp):
    """Apply `nlp` to two small example texts and print the entities it finds.

    For each entity in `doc.ents` one line is printed with its text, label and
    KB identifier. The output of the two texts is separated by an empty line.
    """
    hitchhiker_text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel. "
        "The main character in Doug's novel is the man Arthur Dent, "
        "but Douglas doesn't write about George Washington or Homer Simpson."
    )

    # Q4426480 is her husband, Q3568763 her tutor
    lovelace_text = (
        "Ada Lovelace loved her husband William King dearly. "
        "Ada Lovelace was tutored by her favorite physics tutor William King."
    )

    for position, sample in enumerate((hitchhiker_text, lovelace_text)):
        if position > 0:
            # separator between the output of consecutive texts
            print()
        for mention in nlp(sample).ents:
            print("ent", mention.text, mention.label_, mention.kb_id_)


# Only run the pipeline when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run_pipeline()
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

introduce goldparse.links 2019-06-07 14:54:45 +03:00			`import random`

			`from spacy.util import minibatch, compounding`

first stab at model - not functional yet 2019-05-09 18:23:19 +03:00			`from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`import spacy`
			`from spacy.vocab import Vocab`
			`from spacy.kb import KnowledgeBase`
			`import datetime`

			`"""`
			`Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.`
			`"""`

			`PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'`
			`ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'`
			`ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1'`
			`NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2'`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/'`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`MAX_CANDIDATES = 10`
			`MIN_PAIR_OCC = 5`
			`DOC_CHAR_CUTOFF = 300`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`EPOCHS = 2`
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`DROPOUT = 0.1`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00
			`def run_pipeline():`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print("START", datetime.datetime.now())`
			`print()`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`nlp_1 = spacy.load('en_core_web_lg')`
			`nlp_2 = None`
			`kb_1 = None`
			`kb_2 = None`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`# one-time methods to create KB and write to file`
			`to_create_prior_probs = False`
			`to_create_entity_counts = False`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`to_create_kb = True`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`# read KB back in from file`
			`to_read_kb = True`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`to_test_kb = True`
separate entity encoder to get 64D descriptions 2019-06-05 01:09:46 +03:00
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00			`# create training dataset`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`create_wp_training = False`

eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`# train the EL pipe`
code cleanup 2019-06-06 21:22:14 +03:00			`train_pipe = True`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`measure_performance = False`
code cleanup 2019-06-06 21:22:14 +03:00
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`# test the EL pipe on a simple example`
			`to_test_pipeline = True`
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`# write the NLP object, read back in and test again`
			`test_nlp_io = True`

refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# STEP 1 : create prior probabilities from WP`
			`# run only once !`
			`if to_create_prior_probs:`
			`print("STEP 1: to_create_prior_probs", datetime.datetime.now())`
			`wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)`
			`print()`

			`# STEP 2 : deduce entity frequencies from WP`
			`# run only once !`
			`if to_create_entity_counts:`
			`print("STEP 2: to_create_entity_counts", datetime.datetime.now())`
			`wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)`
			`print()`

			`# STEP 3 : create KB and write to file`
			`# run only once !`
			`if to_create_kb:`
			`print("STEP 3a: to_create_kb", datetime.datetime.now())`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`kb_1 = kb_creator.create_kb(nlp_1,`
storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00			`max_entities_per_alias=MAX_CANDIDATES,`
			`min_occ=MIN_PAIR_OCC,`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`entity_def_output=ENTITY_DEFS,`
			`entity_descr_output=ENTITY_DESCR,`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`count_input=ENTITY_COUNTS,`
			`prior_prob_input=PRIOR_PROB,`
			`to_print=False)`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`print("kb entities:", kb_1.get_size_entities())`
			`print("kb aliases:", kb_1.get_size_aliases())`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print()`

write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`print("STEP 3b: write KB and NLP", datetime.datetime.now())`
			`kb_1.dump(KB_FILE)`
			`nlp_1.to_disk(NLP_1_DIR)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print()`

			`# STEP 4 : read KB back in from file`
			`if to_read_kb:`
			`print("STEP 4: to_read_kb", datetime.datetime.now())`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`# my_vocab = Vocab()`
			`# my_vocab.from_disk(VOCAB_DIR)`
			`# my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64)`
			`nlp_2 = spacy.load(NLP_1_DIR)`
			`kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)`
			`kb_2.load_bulk(KB_FILE)`
			`print("kb entities:", kb_2.get_size_entities())`
			`print("kb aliases:", kb_2.get_size_aliases())`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print()`

			`# test KB`
			`if to_test_kb:`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`run_el.run_kb_toy_example(kb=kb_2)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print()`

			`# STEP 5: create a training dataset from WP`
			`if create_wp_training:`
			`print("STEP 5: create training dataset", datetime.datetime.now())`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00
code cleanup 2019-06-06 21:22:14 +03:00			`# STEP 6: create the entity linking pipe`
			`if train_pipe:`
speeding up training 2019-06-12 14:37:05 +03:00			`print("STEP 6: training Entity Linking pipe", datetime.datetime.now())`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`train_limit = 10`
			`dev_limit = 5`
first tests with EL pipe 2019-06-10 22:25:26 +03:00			`print("Training on", train_limit, "articles")`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`print("Dev testing on", dev_limit, "articles")`
			`print()`
first tests with EL pipe 2019-06-10 22:25:26 +03:00
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`train_data = training_set_creator.read_training(nlp=nlp_2,`
first tests with EL pipe 2019-06-10 22:25:26 +03:00			`training_dir=TRAINING_DIR,`
			`dev=False,`
			`limit=train_limit,`
			`to_print=False)`

write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`dev_data = training_set_creator.read_training(nlp=nlp_2,`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`training_dir=TRAINING_DIR,`
			`dev=True,`
			`limit=dev_limit,`
speeding up training 2019-06-12 14:37:05 +03:00			`to_print=False)`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF})`
			`el_pipe.set_kb(kb_2)`
			`nlp_2.add_pipe(el_pipe, last=True)`
code cleanup 2019-06-06 21:22:14 +03:00
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]`
			`with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking`
			`nlp_2.begin_training()`
storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`for itn in range(EPOCHS):`
			`random.shuffle(train_data)`
			`losses = {}`
			`batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))`
small fixes 2019-06-12 23:05:53 +03:00			`batchnr = 0`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`with nlp_2.disable_pipes(*other_pipes):`
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`for batch in batches:`
speeding up training 2019-06-12 14:37:05 +03:00			`try:`
			`docs, golds = zip(*batch)`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`nlp_2.update(`
speeding up training 2019-06-12 14:37:05 +03:00			`docs,`
			`golds,`
			`drop=DROPOUT,`
			`losses=losses,`
			`)`
small fixes 2019-06-12 23:05:53 +03:00			`batchnr += 1`
speeding up training 2019-06-12 14:37:05 +03:00			`except Exception as e:`
			`print("Error updating batch", e)`

small fixes 2019-06-12 23:05:53 +03:00			`losses['entity_linker'] = losses['entity_linker'] / batchnr`
speeding up training 2019-06-12 14:37:05 +03:00			`print("Epoch, train loss", itn, round(losses['entity_linker'], 2))`

write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`if measure_performance:`
			`print()`
			`print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())`
			`print()`
speeding up training 2019-06-12 14:37:05 +03:00
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`# print(" measuring accuracy 1-1")`
			`el_pipe.context_weight = 1`
			`el_pipe.prior_weight = 1`
			`dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe)`
			`train_acc_1_1 = _measure_accuracy(train_data, el_pipe)`
			`print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2))`

			`# baseline using only prior probabilities`
			`el_pipe.context_weight = 0`
			`el_pipe.prior_weight = 1`
			`dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe)`
			`train_acc_0_1 = _measure_accuracy(train_data, el_pipe)`
			`print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2))`

			`# using only context`
			`el_pipe.context_weight = 1`
			`el_pipe.prior_weight = 0`
			`dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe)`
			`train_acc_1_0 = _measure_accuracy(train_data, el_pipe)`

			`print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2))`
			`print()`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`if to_test_pipeline:`
small fixes 2019-06-12 23:05:53 +03:00			`print()`
			`print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())`
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00			`print()`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`run_el_toy_example(nlp=nlp_2)`
			`print()`

			`if test_nlp_io:`
			`print()`
			`print("STEP 9: testing NLP IO", datetime.datetime.now())`
			`print()`
			`print("writing to", NLP_2_DIR)`
			`print(" vocab len nlp_2", len(nlp_2.vocab))`
			`print(" vocab len kb_2", len(kb_2.vocab))`
			`nlp_2.to_disk(NLP_2_DIR)`
			`print()`
			`print("reading from", NLP_2_DIR)`
			`nlp_3 = spacy.load(NLP_2_DIR)`
			`print(" vocab len nlp_3", len(nlp_3.vocab))`

			`for pipe_name, pipe in nlp_3.pipeline:`
			`if pipe_name == "entity_linker":`
			`print(" vocab len kb_3", len(pipe.kb.vocab))`

eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`print()`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`print("running toy example with NLP 2")`
			`run_el_toy_example(nlp=nlp_3)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`print()`
			`print("STOP", datetime.datetime.now())`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00

speed up predictions 2019-06-11 15:18:20 +03:00			`def _measure_accuracy(data, el_pipe):`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`correct = 0`
			`incorrect = 0`

small fixes 2019-06-12 23:05:53 +03:00			`docs = [d for d, g in data if len(d) > 0]`
speed up predictions 2019-06-11 15:18:20 +03:00			`docs = el_pipe.pipe(docs)`
small fixes 2019-06-12 23:05:53 +03:00			`golds = [g for d, g in data if len(d) > 0]`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00
			`for doc, gold in zip(docs, golds):`
speeding up training 2019-06-12 14:37:05 +03:00			`try:`
			`correct_entries_per_article = dict()`
			`for entity in gold.links:`
			`start, end, gold_kb = entity`
			`correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb`

			`for ent in doc.ents:`
			`if ent.label_ == "PERSON": # TODO: expand to other types`
			`pred_entity = ent.kb_id_`
			`start = ent.start`
			`end = ent.end`
			`gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None)`
			`if gold_entity is not None:`
			`if gold_entity == pred_entity:`
			`correct += 1`
			`else:`
			`incorrect += 1`

			`except Exception as e:`
			`print("Error assessing accuracy", e)`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00
			`if correct == incorrect == 0:`
			`return 0`

			`acc = correct / (correct + incorrect)`
			`return acc`


write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`def run_el_toy_example(nlp):`
eval on dev set, varying combo's of prior and context scores 2019-06-11 12:40:58 +03:00			`text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \`
			`"Douglas reminds us to always bring our towel. " \`
			`"The main character in Doug's novel is the man Arthur Dent, " \`
			`"but Douglas doesn't write about George Washington or Homer Simpson."`
			`doc = nlp(text)`

			`for ent in doc.ents:`
			`print("ent", ent.text, ent.label_, ent.kb_id_)`

			`print()`

			`# Q4426480 is her husband, Q3568763 her tutor`
			`text = "Ada Lovelace loved her husband William King dearly. " \`
			`"Ada Lovelace was tutored by her favorite physics tutor William King."`
			`doc = nlp(text)`

			`for ent in doc.ents:`
			`print("ent", ent.text, ent.label_, ent.kb_id_)`


			`if __name__ == "__main__":`
write entity linking pipe to file and keep vocab consistent between kb and nlp 2019-06-13 17:25:39 +03:00			`run_pipeline()`