spaCy/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py

# coding: utf-8
from __future__ import unicode_literals

import random

from spacy.util import minibatch, compounding

from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
from examples.pipeline.wiki_entity_linking.train_el import EL_Model

import spacy
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase
import datetime

"""
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""

# Single root folder under which every input/output artefact of this script lives.
_WIKI_DATA_DIR = 'C:/Users/Sofie/Documents/data/wikipedia'

# CSV files exchanged between the KB-building steps
# (passed as prior_prob_*/count_*/entity_def_*/entity_descr_* arguments).
PRIOR_PROB = _WIKI_DATA_DIR + '/prior_prob.csv'
ENTITY_COUNTS = _WIKI_DATA_DIR + '/entity_freq.csv'
ENTITY_DEFS = _WIKI_DATA_DIR + '/entity_defs.csv'
ENTITY_DESCR = _WIKI_DATA_DIR + '/entity_descriptions.csv'

# Where the knowledge base is dumped to / loaded from, and the vocab saved with it.
KB_FILE = _WIKI_DATA_DIR + '/kb'
VOCAB_DIR = _WIKI_DATA_DIR + '/vocab'

# Directory used as training_output when creating, and training_dir when
# reading, the training data (the trailing slash is part of the value).
TRAINING_DIR = _WIKI_DATA_DIR + '/training_data_nel/'

# Knowledge base creation settings (max_entities_per_alias / min_occ).
MAX_CANDIDATES = 10
MIN_PAIR_OCC = 5

# Entity linker settings: "doc_cutoff" config value, number of passes over
# the training data, and the drop rate handed to nlp.update.
DOC_CHAR_CUTOFF = 300
EPOCHS = 5
DROPOUT = 0.1

if __name__ == "__main__":
    # Driver for the whole entity linking example. Every STEP below is guarded
    # by one of the boolean switches defined in this block; edit the switches
    # to choose what a given run does. Steps run in the order they appear.
    print("START", datetime.datetime.now())
    print()
    nlp = spacy.load('en_core_web_lg')
    my_kb = None  # filled in by STEP 3 (create) and/or STEP 4 (read from file)

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the entity linking pipe (STEP 6)
    train_pipe = True

    # run EL training
    # NOTE(review): no step in this script reads this switch; it is kept so the
    # set of switches stays unchanged.
    run_el_training = False

    # apply named entity linking to the dev dataset
    apply_to_dev = False

    to_test_pipeline = False

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        my_kb = kb_creator.create_kb(nlp,
                                     max_entities_per_alias=MAX_CANDIDATES,
                                     min_occ=MIN_PAIR_OCC,
                                     entity_def_output=ENTITY_DEFS,
                                     entity_descr_output=ENTITY_DESCR,
                                     count_input=ENTITY_COUNTS,
                                     prior_prob_input=PRIOR_PROB,
                                     to_print=False)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # The vocab is written next to the KB so that STEP 4 can restore both.
        print("STEP 3b: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        nlp.vocab.to_disk(VOCAB_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64)  # TODO entity vectors
        my_kb.load_bulk(KB_FILE)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # test KB (only available right after reading it back in)
        if to_test_kb:
            run_el.run_kb_toy_example(kb=my_kb)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)

    # STEP 6: create the entity linking pipe
    if train_pipe:
        # Fail early with a readable message instead of handing None to the
        # pipe when neither STEP 3 nor STEP 4 produced a knowledge base.
        if my_kb is None:
            raise ValueError(
                "Cannot train the entity linker without a knowledge base: "
                "enable to_create_kb or to_read_kb."
            )

        train_limit = 10
        print("Training on", train_limit, "articles")

        train_data = training_set_creator.read_training(nlp=nlp,
                                                        training_dir=TRAINING_DIR,
                                                        dev=False,
                                                        limit=train_limit,
                                                        to_print=False)

        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})
        nlp.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
        with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
            nlp.begin_training()

            for itn in range(EPOCHS):
                print()
                print("EPOCH", itn)
                random.shuffle(train_data)
                losses = {}  # reset every epoch so the printed losses are per-epoch
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                for batch in batches:
                    # each training item is unpacked as a (doc, gold) pair
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        drop=DROPOUT,
                        losses=losses,
                    )
                print("Losses", losses)

    # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?)
    if apply_to_dev:
        run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000)
        print()

    # test the pipeline on a toy example
    if to_test_pipeline:
        run_el.run_el_toy_example(kb=my_kb, nlp=nlp)
        print()

    # TODO coreference resolution
    # add_coref()

    print()
    print("STOP", datetime.datetime.now())
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

introduce goldparse.links 2019-06-07 14:54:45 +03:00			`import random`

			`from spacy.util import minibatch, compounding`

first stab at model - not functional yet 2019-05-09 18:23:19 +03:00			`from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el`
			`from examples.pipeline.wiki_entity_linking.train_el import EL_Model`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`import spacy`
			`from spacy.vocab import Vocab`
			`from spacy.kb import KnowledgeBase`
			`import datetime`

			`"""`
			`Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.`
			`"""`

			`PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'`
			`ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'`
			`ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'`
			`VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'`

using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/'`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`MAX_CANDIDATES = 10`
			`MIN_PAIR_OCC = 5`
			`DOC_CHAR_CUTOFF = 300`
			`EPOCHS = 5`
			`DROPOUT = 0.1`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`if __name__ == "__main__":`
			`print("START", datetime.datetime.now())`
			`print()`
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`nlp = spacy.load('en_core_web_lg')`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`my_kb = None`

			`# one-time methods to create KB and write to file`
			`to_create_prior_probs = False`
			`to_create_entity_counts = False`
first stab at model - not functional yet 2019-05-09 18:23:19 +03:00			`to_create_kb = False`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`# read KB back in from file`
			`to_read_kb = True`
code cleanup 2019-06-06 21:22:14 +03:00			`to_test_kb = False`
separate entity encoder to get 64D descriptions 2019-06-05 01:09:46 +03:00
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00			`# create training dataset`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`create_wp_training = False`

code cleanup 2019-06-06 21:22:14 +03:00			`train_pipe = True`

separate entity encoder to get 64D descriptions 2019-06-05 01:09:46 +03:00			`# run EL training`
			`run_el_training = False`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00
			`# apply named entity linking to the dev dataset`
			`apply_to_dev = False`
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`to_test_pipeline = False`

refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# STEP 1 : create prior probabilities from WP`
			`# run only once !`
			`if to_create_prior_probs:`
			`print("STEP 1: to_create_prior_probs", datetime.datetime.now())`
			`wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)`
			`print()`

			`# STEP 2 : deduce entity frequencies from WP`
			`# run only once !`
			`if to_create_entity_counts:`
			`print("STEP 2: to_create_entity_counts", datetime.datetime.now())`
			`wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)`
			`print()`

			`# STEP 3 : create KB and write to file`
			`# run only once !`
			`if to_create_kb:`
			`print("STEP 3a: to_create_kb", datetime.datetime.now())`
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`my_kb = kb_creator.create_kb(nlp,`
storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00			`max_entities_per_alias=MAX_CANDIDATES,`
			`min_occ=MIN_PAIR_OCC,`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`entity_def_output=ENTITY_DEFS,`
			`entity_descr_output=ENTITY_DESCR,`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`count_input=ENTITY_COUNTS,`
			`prior_prob_input=PRIOR_PROB,`
			`to_print=False)`
			`print("kb entities:", my_kb.get_size_entities())`
			`print("kb aliases:", my_kb.get_size_aliases())`
			`print()`

			`print("STEP 3b: write KB", datetime.datetime.now())`
			`my_kb.dump(KB_FILE)`
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`nlp.vocab.to_disk(VOCAB_DIR)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print()`

			`# STEP 4 : read KB back in from file`
			`if to_read_kb:`
			`print("STEP 4: to_read_kb", datetime.datetime.now())`
			`my_vocab = Vocab()`
			`my_vocab.from_disk(VOCAB_DIR)`
entity vectors in the KB + serialization of them 2019-06-05 19:29:18 +03:00			`my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) # TODO entity vectors`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`my_kb.load_bulk(KB_FILE)`
			`print("kb entities:", my_kb.get_size_entities())`
			`print("kb aliases:", my_kb.get_size_aliases())`
			`print()`

			`# test KB`
			`if to_test_kb:`
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`run_el.run_kb_toy_example(kb=my_kb)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`print()`

			`# STEP 5: create a training dataset from WP`
			`if create_wp_training:`
			`print("STEP 5: create training dataset", datetime.datetime.now())`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)`

code cleanup 2019-06-06 21:22:14 +03:00			`# STEP 6: create the entity linking pipe`
			`if train_pipe:`
storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00			`id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR)`

first tests with EL pipe 2019-06-10 22:25:26 +03:00			`train_limit = 10`
			`print("Training on", train_limit, "articles")`

introduce goldparse.links 2019-06-07 14:54:45 +03:00			`train_data = training_set_creator.read_training(nlp=nlp,`
first tests with EL pipe 2019-06-10 22:25:26 +03:00			`training_dir=TRAINING_DIR,`
			`dev=False,`
			`limit=train_limit,`
			`to_print=False)`

			`el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})`
code cleanup 2019-06-06 21:22:14 +03:00			`nlp.add_pipe(el_pipe, last=True)`

storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00			`other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]`
			`with nlp.disable_pipes(*other_pipes): # only train Entity Linking`
			`nlp.begin_training()`

introduce goldparse.links 2019-06-07 14:54:45 +03:00			`for itn in range(EPOCHS):`
training loop in proper pipe format 2019-06-07 16:55:10 +03:00			`print()`
			`print("EPOCH", itn)`
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`random.shuffle(train_data)`
			`losses = {}`
first tests with EL pipe 2019-06-10 22:25:26 +03:00			`batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))`
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`for batch in batches:`
			`docs, golds = zip(*batch)`
			`nlp.update(`
			`docs,`
			`golds,`
			`drop=DROPOUT,`
			`losses=losses,`
			`)`
			`print("Losses", losses)`

pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`# STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?)`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`if apply_to_dev:`
pretraining description vectors and storing them in the KB 2019-06-06 20:51:27 +03:00			`run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000)`
			`print()`

			`# test KB`
			`if to_test_pipeline:`
			`run_el.run_el_toy_example(kb=my_kb, nlp=nlp)`
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00			`print()`

refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# TODO coreference resolution`
			`# add_coref()`

			`print()`
			`print("STOP", datetime.datetime.now())`