Mirror of https://github.com/explosion/spaCy.git, synced 2024-11-10 19:57:17 +03:00
refactor code to separate functionality into different files
This commit is contained in:
parent f5190267e7
commit 6961215578
@@ -66,6 +66,6 @@ def add_el(kb, nlp):

 if __name__ == "__main__":
-    nlp = spacy.load('en_core_web_sm')
-    my_kb = create_kb(nlp.vocab)
-    add_el(my_kb, nlp)
+    my_nlp = spacy.load('en_core_web_sm')
+    my_kb = create_kb(my_nlp.vocab)
+    add_el(my_kb, my_nlp)
0 examples/pipeline/wiki_entity_linking/__init__.py Normal file
137 examples/pipeline/wiki_entity_linking/kb_creator.py Normal file
@@ -0,0 +1,137 @@
# coding: utf-8
from __future__ import unicode_literals

import spacy
from spacy.kb import KnowledgeBase

import datetime

from . import wikipedia_processor as wp
from . import wikidata_processor as wd


def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input,
              to_print=False, write_entity_defs=True):
    """ Create the knowledge base from Wikidata entries """
    kb = KnowledgeBase(vocab=vocab)

    print()
    print("1. _read_wikidata_entities", datetime.datetime.now())
    print()
    # title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
    title_to_id = wd.read_wikidata_entities_json(limit=None)

    # write the title-ID mapping to file
    if write_entity_defs:
        with open(entity_output, mode='w', encoding='utf8') as entity_file:
            entity_file.write("WP_title" + "|" + "WD_id" + "\n")
            for title, qid in title_to_id.items():
                entity_file.write(title + "|" + str(qid) + "\n")

    title_list = list(title_to_id.keys())
    entity_list = [title_to_id[x] for x in title_list]

    print()
    print("2. _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)

    print()
    print("3. adding", len(entity_list), "entities", datetime.datetime.now())
    print()
    kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None)

    print()
    print("4. adding aliases", datetime.datetime.now())
    print()
    _add_aliases(kb, title_to_id=title_to_id,
                 max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
                 prior_prob_input=prior_prob_input)

    if to_print:
        print()
        print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    print("done with kb", datetime.datetime.now())

    return kb


def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False):
    wp_titles = title_to_id.keys()

    if to_print:
        print("wp titles:", wp_titles)

    # adding aliases with prior probabilities
    with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()
        # we can read this file sequentially, it's sorted by alias, and then by count
        previous_alias = None
        total_count = 0
        counts = list()
        entities = list()
        while line:
            splits = line.replace('\n', "").split(sep='|')
            new_alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            if new_alias != previous_alias and previous_alias:
                # done reading the previous alias --> output
                if len(entities) > 0:
                    selected_entities = list()
                    prior_probs = list()
                    for ent_count, ent_string in zip(counts, entities):
                        if ent_string in wp_titles:
                            wd_id = title_to_id[ent_string]
                            p_entity_givenalias = ent_count / total_count
                            selected_entities.append(wd_id)
                            prior_probs.append(p_entity_givenalias)

                    if selected_entities:
                        try:
                            kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs)
                        except ValueError as e:
                            print(e)
                total_count = 0
                counts = list()
                entities = list()

            total_count += count

            if len(entities) < max_entities_per_alias and count >= min_occ:
                counts.append(count)
                entities.append(entity)
            previous_alias = new_alias

            line = prior_file.readline()

    if to_print:
        print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())


def test_kb(kb):
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    nlp = spacy.load('en_core_web_sm')

    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)

    candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for c in candidates:
        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)

    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)
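The _add_aliases loop above assumes a pipe-delimited prior-probability file ("alias|count|entity"), sorted by alias and then by descending count. A minimal sketch, with made-up counts, of the prior probabilities it derives per alias:

# Illustrative only: toy rows in the "alias|count|entity" format consumed by _add_aliases.
rows = [("Bush", 920, "George W. Bush"), ("Bush", 80, "Bush (band)")]
total = sum(count for _, count, _ in rows)        # 1000
priors = [count / total for _, count, _ in rows]  # [0.92, 0.08]
# These ratios are what kb.add_alias() receives as `probabilities`.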
36 examples/pipeline/wiki_entity_linking/run_el.py Normal file
@@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals

import spacy

# requires: pip install neuralcoref --no-binary neuralcoref
# import neuralcoref


# TODO
def add_coref():
    """ Add coreference resolution to our model """
    nlp = spacy.load('en_core_web_sm')
    # nlp = spacy.load('en')

    # TODO: this doesn't work yet
    # neuralcoref.add_to_pipe(nlp)
    print("done adding to pipe")

    doc = nlp(u'My sister has a dog. She loves him.')
    print("done doc")

    print(doc._.has_coref)
    print(doc._.coref_clusters)


# TODO
def _run_ner_depr(nlp, clean_text, article_dict):
    doc = nlp(clean_text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":  # TODO: expand to non-persons
            ent_id = article_dict.get(ent.text)
            if ent_id:
                print(" -", ent.text, ent.label_, ent_id)
            else:
                print(" -", ent.text, ent.label_, '???')  # TODO: investigate these cases
276 examples/pipeline/wiki_entity_linking/training_set_creator.py Normal file
@@ -0,0 +1,276 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import csv
import bz2
import datetime

from . import wikipedia_processor as wp

"""
Process Wikipedia interlinks to generate a training dataset for the EL algorithm
"""


def create_training(kb, entity_input, training_output):
    if not kb:
        raise ValueError("kb should be defined")
    # nlp = spacy.load('en_core_web_sm')
    wp_to_id = _get_entity_to_id(entity_input)
    _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000)  # TODO: full dataset


def _get_entity_to_id(entity_input):
    entity_to_id = dict()
    with open(entity_input, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_id[row[0]] = row[1]

    return entity_to_id


def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
    """
    Read the XML wikipedia data to parse out training data:
    raw text data + positive and negative instances
    """

    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')

    read_ids = set()

    entityfile_loc = training_output + "/" + "gold_entities.csv"
    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
        # write entity training header file
        _write_training_entity(outputfile=entityfile,
                               article_id="article_id",
                               alias="alias",
                               entity="entity",
                               correct="correct")

        with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file:
            line = file.readline()
            cnt = 0
            article_text = ""
            article_title = None
            article_id = None
            reading_text = False
            reading_revision = False
            while line and (not limit or cnt < limit):
                if cnt % 1000000 == 0:
                    print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = line.strip().decode("utf-8")
                # print(clean_line)

                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False

                # Start reading new page
                if clean_line == "<page>":
                    article_text = ""
                    article_title = None
                    article_id = None

                # finished reading this page
                elif clean_line == "</page>":
                    if article_id:
                        try:
                            _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text.strip(), training_output)
                        # on a previous run, an error occurred after 46M lines and 2h
                        except Exception as e:
                            print("Error processing article", article_id, article_title, e)
                    else:
                        print("Done processing a page, but couldn't find an article_id ?")
                        print(article_title)
                        print(article_text)
                    article_text = ""
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False

                # start reading text within a page
                if "<text" in clean_line:
                    reading_text = True

                if reading_text:
                    article_text += " " + clean_line

                # stop reading text within a page (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False

                # read the ID of this article (outside the revision portion of the document)
                if not reading_revision:
                    ids = id_regex.search(clean_line)
                    if ids:
                        article_id = ids[0]
                        if article_id in read_ids:
                            print("Found duplicate article ID", article_id, clean_line)  # This should never happen ...
                        read_ids.add(article_id)

                # read the title of this article (outside the revision portion of the document)
                if not reading_revision:
                    titles = title_regex.search(clean_line)
                    if titles:
                        article_title = titles[0].strip()

                line = file.readline()
                cnt += 1


text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')


def _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text, training_output):
    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # print("WP article", article_id, ":", article_title)
    # print()
    # print(text)

    # get the raw text without markup etc
    clean_text = _get_clean_wp_text(text)
    # print()
    # print(clean_text)

    article_dict = dict()
    ambiguous_aliases = set()
    aliases, entities, normalizations = wp.get_wp_links(text)
    for alias, entity, norm in zip(aliases, entities, normalizations):
        if alias not in ambiguous_aliases:
            entity_id = wp_to_id.get(entity)
            if entity_id:
                # TODO: take care of these conflicts ! Currently they are being removed from the dataset
                if article_dict.get(alias) and article_dict[alias] != entity_id:
                    ambiguous_aliases.add(alias)
                    article_dict.pop(alias)
                    # print("Found conflicting alias", alias, "in article", article_id, article_title)
                else:
                    article_dict[alias] = entity_id

    # print("found entities:")
    for alias, entity in article_dict.items():
        # print(alias, "-->", entity)
        candidates = kb.get_candidates(alias)

        # as training data, we only store entities that are sufficiently ambiguous
        if len(candidates) > 1:
            _write_training_article(article_id=article_id, clean_text=clean_text, training_output=training_output)
            # print("alias", alias)

            # print all incorrect candidates
            for c in candidates:
                if entity != c.entity_:
                    _write_training_entity(outputfile=entityfile,
                                           article_id=article_id,
                                           alias=alias,
                                           entity=c.entity_,
                                           correct="0")

            # print the one correct candidate
            _write_training_entity(outputfile=entityfile,
                                   article_id=article_id,
                                   alias=alias,
                                   entity=entity,
                                   correct="1")

            # print("gold entity", entity)
    # print()

    # _run_ner_depr(nlp, clean_text, article_dict)
    # print()


info_regex = re.compile(r'{[^{]*?}')
interwiki_regex = re.compile(r'\[\[([^|]*?)]]')
interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]')
htlm_regex = re.compile(r'&lt;!--[^!]*--&gt;')
category_regex = re.compile(r'\[\[Category:[^\[]*]]')
file_regex = re.compile(r'\[\[File:[^[\]]+]]')
ref_regex = re.compile(r'&lt;ref.*?&gt;')  # non-greedy
ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')  # non-greedy


def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace('\'\'\'', '')
    clean_text = clean_text.replace('\'\'', '')

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove simple interwiki links (no alternative name)
    clean_text = interwiki_regex.sub(r'\1', clean_text)

    # remove simple interwiki links by picking the alternative name
    clean_text = interwiki_2_regex.sub(r'\1', clean_text)

    # remove HTML comments
    clean_text = htlm_regex.sub('', clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)

    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')

    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')

    return clean_text.strip()


def _write_training_article(article_id, clean_text, training_output):
    file_loc = training_output + "/" + str(article_id) + ".txt"
    with open(file_loc, mode='w', encoding='utf8') as outputfile:
        outputfile.write(clean_text)


def _write_training_entity(outputfile, article_id, alias, entity, correct):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
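For reference, _write_training_entity above emits one pipe-delimited row per candidate of an ambiguous mention. A sketch with hypothetical article IDs and QIDs of what gold_entities.csv would contain:

# Illustrative only: values below are made up.
header = "article_id|alias|entity|correct"
rows = ["12345|Bush|Q1124|0",   # one row per incorrect candidate
        "12345|Bush|Q207|1"]    # exactly one row for the gold entity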
103 examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py Normal file
@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals

from . import wikipedia_processor as wp, kb_creator, training_set_creator

import spacy
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase
import datetime

"""
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""

PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'

KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'

TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'


if __name__ == "__main__":
    print("START", datetime.datetime.now())
    print()
    my_kb = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    create_wp_training = False

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        my_nlp = spacy.load('en_core_web_sm')
        my_vocab = my_nlp.vocab
        my_kb = kb_creator.create_kb(my_vocab,
                                     max_entities_per_alias=10,
                                     min_occ=5,
                                     entity_output=ENTITY_DEFS,
                                     count_input=ENTITY_COUNTS,
                                     prior_prob_input=PRIOR_PROB,
                                     to_print=False)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        print("STEP 3b: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        my_vocab.to_disk(VOCAB_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab)
        my_kb.load_bulk(KB_FILE)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            kb_creator.test_kb(my_kb)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)

    # TODO coreference resolution
    # add_coref()

    print()
    print("STOP", datetime.datetime.now())
166 examples/pipeline/wiki_entity_linking/wikidata_processor.py Normal file
@@ -0,0 +1,166 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import bz2
import json
import datetime

# TODO: remove hardcoded paths
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'


def read_wikidata_entities_json(limit=None, to_print=False):
    """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """

    languages = {'en', 'de'}
    prop_filter = {'P31': {'Q5', 'Q15632617'}}  # currently defined as OR: one property suffices to be selected
    site_filter = 'enwiki'

    title_to_id = dict()

    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_descriptions = False
    parse_aliases = False

    with bz2.open(WIKIDATA_JSON, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line and (not limit or cnt < limit):
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
                entry_type = obj["type"]

                if entry_type == "item":
                    # filtering records on their properties
                    keep = False

                    claims = obj["claims"]
                    for prop, value_set in prop_filter.items():
                        claim_property = claims.get(prop, None)
                        if claim_property:
                            for cp in claim_property:
                                cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
                                cp_rank = cp['rank']
                                if cp_rank != "deprecated" and cp_id in value_set:
                                    keep = True

                    if keep:
                        unique_id = obj["id"]

                        if to_print:
                            print("ID:", unique_id)
                            print("type:", entry_type)

                        # parsing all properties that refer to other entities
                        if parse_properties:
                            for prop, claim_property in claims.items():
                                cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
                                cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
                                if cp_values:
                                    if to_print:
                                        print("prop:", prop, cp_values)

                        if parse_sitelinks:
                            site_value = obj["sitelinks"].get(site_filter, None)
                            if site_value:
                                site = site_value['title']
                                if to_print:
                                    print(site_filter, ":", site)
                                title_to_id[site] = unique_id
                                # print(site, "for", unique_id)

                        if parse_labels:
                            labels = obj["labels"]
                            if labels:
                                for lang in languages:
                                    lang_label = labels.get(lang, None)
                                    if lang_label:
                                        if to_print:
                                            print("label (" + lang + "):", lang_label["value"])

                        if parse_descriptions:
                            descriptions = obj["descriptions"]
                            if descriptions:
                                for lang in languages:
                                    lang_descr = descriptions.get(lang, None)
                                    if lang_descr:
                                        if to_print:
                                            print("description (" + lang + "):", lang_descr["value"])

                        if parse_aliases:
                            aliases = obj["aliases"]
                            if aliases:
                                for lang in languages:
                                    lang_aliases = aliases.get(lang, None)
                                    if lang_aliases:
                                        for item in lang_aliases:
                                            if to_print:
                                                print("alias (" + lang + "):", item["value"])

                        if to_print:
                            print()
            line = file.readline()
            cnt += 1

    return title_to_id


def _read_wikidata_entities_regex_depr(limit=None):
    """
    Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines.
    TODO: doesn't work yet. may be deleted ?
    """

    regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
    regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
    regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
    regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)

    title_to_id = dict()

    with bz2.open(WIKIDATA_JSON, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line and (not limit or cnt < limit):
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                clean_line = line.strip().decode("utf-8")
                keep = False

                p31_matches = regex_p31.findall(clean_line)
                if p31_matches:
                    for p31_match in p31_matches:
                        id_matches = regex_id.findall(p31_match)
                        for id_match in id_matches:
                            id_match = id_match[6:][:-1]
                            if id_match == "Q5" or id_match == "Q15632617":
                                keep = True

                if keep:
                    id_match = regex_id.search(clean_line).group(0)
                    id_match = id_match[6:][:-1]

                    enwiki_matches = regex_enwiki.findall(clean_line)
                    if enwiki_matches:
                        for enwiki_match in enwiki_matches:
                            title_match = regex_title.search(enwiki_match).group(0)
                            title = title_match[9:][:-1]
                            title_to_id[title] = id_match

            line = file.readline()
            cnt += 1

    return title_to_id
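A minimal sketch, using a made-up and heavily truncated record, of the fields read_wikidata_entities_json above inspects when deciding whether to keep an entity (the real code additionally guards against missing keys and deprecated ranks):

# Illustrative only: a fabricated, stripped-down WikiData line.
import json

line = json.dumps({
    "type": "item",
    "id": "Q42",
    "claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}},
                        "rank": "normal"}]},
    "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
})
obj = json.loads(line)
keep = any(cp["mainsnak"]["datavalue"]["value"]["id"] in {"Q5", "Q15632617"}
           for cp in obj["claims"]["P31"])
if keep:
    print(obj["sitelinks"]["enwiki"]["title"], "->", obj["id"])  # Douglas Adams -> Q42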
187 examples/pipeline/wiki_entity_linking/wikipedia_processor.py Normal file
@@ -0,0 +1,187 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import bz2
import csv
import datetime

"""
Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
"""

# TODO: remove hardcoded paths
ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'

map_alias_to_link = dict()

# these will/should be matched ignoring case
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
                   "d", "dbdump", "download", "Draft", "Education", "Foundation",
                   "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator",
                   "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki",
                   "MediaZilla", "Meta", "Metawikipedia", "Module",
                   "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki",
                   "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev",
                   "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn",
                   "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki",
                   "User", "User talk", "v", "voy",
                   "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews",
                   "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech",
                   "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"]

# find the links
link_regex = re.compile(r'\[\[[^\[\]]*\]\]')

# match on interwiki links, e.g. `en:` or `:fr:`
ns_regex = r":?" + "[a-z][a-z]" + ":"

# match on Namespace: optionally preceded by a :
for ns in wiki_namespaces:
    ns_regex += "|" + ":?" + ns + ":"

ns_regex = re.compile(ns_regex, re.IGNORECASE)


def read_wikipedia_prior_probs(prior_prob_output):
    """
    Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
    The full file takes about 2h to parse 1100M lines (update printed every 5M lines).
    It works relatively fast because we don't care about which article we parsed the interwiki from,
    we just process line by line.
    """

    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line:
            if cnt % 5000000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
            clean_line = line.strip().decode("utf-8")

            aliases, entities, normalizations = get_wp_links(clean_line)
            for alias, entity, norm in zip(aliases, entities, normalizations):
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)

            line = file.readline()
            cnt += 1

    # write all aliases and their entities and occurrences to file
    with open(prior_prob_output, mode='w', encoding='utf8') as outputfile:
        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")


def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
    alias = alias.strip()
    entity = entity.strip()

    # remove everything after # as this is not part of the title but refers to a specific paragraph
    if normalize_entity:
        # wikipedia titles are always capitalized
        entity = _capitalize_first(entity.split("#")[0])
    if normalize_alias:
        alias = alias.split("#")[0]

    if alias and entity:
        alias_dict = map_alias_to_link.get(alias, dict())
        entity_count = alias_dict.get(entity, 0)
        alias_dict[entity] = entity_count + 1
        map_alias_to_link[alias] = alias_dict


def get_wp_links(text):
    aliases = []
    entities = []
    normalizations = []

    matches = link_regex.findall(text)
    for match in matches:
        match = match[2:][:-2].replace("_", " ").strip()

        if ns_regex.match(match):
            pass  # ignore namespaces at the beginning of the string

        # this is a simple link, with the alias the same as the mention
        elif "|" not in match:
            aliases.append(match)
            entities.append(match)
            normalizations.append(True)

        # in wiki format, the link is written as [[entity|alias]]
        else:
            splits = match.split("|")
            entity = splits[0].strip()
            alias = splits[1].strip()
            # specific wiki format [[alias (specification)|]]
            if len(alias) == 0 and "(" in entity:
                alias = entity.split("(")[0]
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)
            else:
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)

    return aliases, entities, normalizations


def _capitalize_first(text):
    if not text:
        return None
    result = text[0].capitalize()
    if len(result) > 0:
        result += text[1:]
    return result


def write_entity_counts(prior_prob_input, count_output, to_print=False):
    """ Write entity counts for quick access later """
    entity_to_count = dict()
    total_count = 0

    with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()

        while line:
            splits = line.replace('\n', "").split(sep='|')
            # alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            current_count = entity_to_count.get(entity, 0)
            entity_to_count[entity] = current_count + count

            total_count += count

            line = prior_file.readline()

    with open(count_output, mode='w', encoding='utf8') as entity_file:
        entity_file.write("entity" + "|" + "count" + "\n")
        for entity, count in entity_to_count.items():
            entity_file.write(entity + "|" + str(count) + "\n")

    if to_print:
        for entity, count in entity_to_count.items():
            print("Entity count:", entity, count)
        print("Total count:", total_count)


def get_entity_frequencies(count_input, entities):
    entity_to_count = dict()
    with open(count_input, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_count[row[0]] = int(row[1])

    return [entity_to_count.get(e, 0) for e in entities]
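A rough usage sketch of get_wp_links above on an invented line of wiki markup, with the expected outputs shown as comments:

# Illustrative only: the input sentence is made up.
text = "He wrote [[The Hitchhiker's Guide to the Galaxy]] with [[Douglas Adams|Adams]]."
aliases, entities, normalizations = get_wp_links(text)
# aliases        -> ["The Hitchhiker's Guide to the Galaxy", "Adams"]
# entities       -> ["The Hitchhiker's Guide to the Galaxy", "Douglas Adams"]
# normalizations -> [True, False]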
|
@ -1,852 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
|
||||
"""
|
||||
import re
|
||||
import csv
|
||||
import json
|
||||
import spacy
|
||||
import datetime
|
||||
import bz2
|
||||
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
# requires: pip install neuralcoref --no-binary neuralcoref
|
||||
# import neuralcoref
|
||||
|
||||
# TODO: remove hardcoded paths
|
||||
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
|
||||
ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
|
||||
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
|
||||
|
||||
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||
ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
|
||||
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
|
||||
|
||||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
|
||||
|
||||
TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
|
||||
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
|
||||
|
||||
|
||||
# these will/should be matched ignoring case
|
||||
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
|
||||
"d", "dbdump", "download", "Draft", "Education", "Foundation",
|
||||
"Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator",
|
||||
"m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki",
|
||||
"MediaZilla", "Meta", "Metawikipedia", "Module",
|
||||
"mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki",
|
||||
"Portal", "phab", "Phabricator", "Project", "q", "quality", "rev",
|
||||
"s", "spcom", "Special", "species", "Strategy", "sulutil", "svn",
|
||||
"Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki",
|
||||
"User", "User talk", "v", "voy",
|
||||
"w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews",
|
||||
"Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech",
|
||||
"Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"]
|
||||
|
||||
map_alias_to_link = dict()
|
||||
|
||||
|
||||
def read_wikipedia_prior_probs():
|
||||
"""
|
||||
STEP 1: Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
|
||||
The full file takes about 2h to parse 1100M lines (update printed every 5M lines).
|
||||
It works relatively fast because we don't care about which article we parsed the interwiki from,
|
||||
we just process line by line.
|
||||
"""
|
||||
|
||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line:
|
||||
if cnt % 5000000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
|
||||
aliases, entities, normalizations = _get_wp_links(clean_line)
|
||||
for alias, entity, norm in zip(aliases, entities, normalizations):
|
||||
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
# write all aliases and their entities and occurrences to file
|
||||
with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
|
||||
outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
|
||||
for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
|
||||
for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
|
||||
outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
|
||||
|
||||
|
||||
# find the links
|
||||
link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
|
||||
|
||||
# match on interwiki links, e.g. `en:` or `:fr:`
|
||||
ns_regex = r":?" + "[a-z][a-z]" + ":"
|
||||
|
||||
# match on Namespace: optionally preceded by a :
|
||||
for ns in wiki_namespaces:
|
||||
ns_regex += "|" + ":?" + ns + ":"
|
||||
|
||||
ns_regex = re.compile(ns_regex, re.IGNORECASE)
|
||||
|
||||
|
||||
def _get_wp_links(text):
|
||||
aliases = []
|
||||
entities = []
|
||||
normalizations = []
|
||||
|
||||
matches = link_regex.findall(text)
|
||||
for match in matches:
|
||||
match = match[2:][:-2].replace("_", " ").strip()
|
||||
|
||||
if ns_regex.match(match):
|
||||
pass # ignore namespaces at the beginning of the string
|
||||
|
||||
# this is a simple link, with the alias the same as the mention
|
||||
elif "|" not in match:
|
||||
aliases.append(match)
|
||||
entities.append(match)
|
||||
normalizations.append(True)
|
||||
|
||||
# in wiki format, the link is written as [[entity|alias]]
|
||||
else:
|
||||
splits = match.split("|")
|
||||
entity = splits[0].strip()
|
||||
alias = splits[1].strip()
|
||||
# specific wiki format [[alias (specification)|]]
|
||||
if len(alias) == 0 and "(" in entity:
|
||||
alias = entity.split("(")[0]
|
||||
aliases.append(alias)
|
||||
entities.append(entity)
|
||||
normalizations.append(False)
|
||||
else:
|
||||
aliases.append(alias)
|
||||
entities.append(entity)
|
||||
normalizations.append(False)
|
||||
|
||||
return aliases, entities, normalizations
|
||||
|
||||
|
||||
def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
|
||||
alias = alias.strip()
|
||||
entity = entity.strip()
|
||||
|
||||
# remove everything after # as this is not part of the title but refers to a specific paragraph
|
||||
if normalize_entity:
|
||||
# wikipedia titles are always capitalized
|
||||
entity = _capitalize_first(entity.split("#")[0])
|
||||
if normalize_alias:
|
||||
alias = alias.split("#")[0]
|
||||
|
||||
if alias and entity:
|
||||
alias_dict = map_alias_to_link.get(alias, dict())
|
||||
entity_count = alias_dict.get(entity, 0)
|
||||
alias_dict[entity] = entity_count + 1
|
||||
map_alias_to_link[alias] = alias_dict
|
||||
|
||||
|
||||
def _capitalize_first(text):
|
||||
if not text:
|
||||
return None
|
||||
result = text[0].capitalize()
|
||||
if len(result) > 0:
|
||||
result += text[1:]
|
||||
return result
|
||||
|
||||
|
||||
def write_entity_counts(to_print=False):
|
||||
""" STEP 2: write entity counts """
|
||||
entity_to_count = dict()
|
||||
total_count = 0
|
||||
|
||||
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
|
||||
while line:
|
||||
splits = line.replace('\n', "").split(sep='|')
|
||||
# alias = splits[0]
|
||||
count = int(splits[1])
|
||||
entity = splits[2]
|
||||
|
||||
current_count = entity_to_count.get(entity, 0)
|
||||
entity_to_count[entity] = current_count + count
|
||||
|
||||
total_count += count
|
||||
|
||||
line = prior_file.readline()
|
||||
|
||||
with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
|
||||
entity_file.write("entity" + "|" + "count" + "\n")
|
||||
for entity, count in entity_to_count.items():
|
||||
entity_file.write(entity + "|" + str(count) + "\n")
|
||||
|
||||
if to_print:
|
||||
for entity, count in entity_to_count.items():
|
||||
print("Entity count:", entity, count)
|
||||
print("Total count:", total_count)
|
||||
|
||||
|
||||
def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True):
|
||||
""" STEP 3: create the knowledge base """
|
||||
kb = KnowledgeBase(vocab=vocab)
|
||||
|
||||
print()
|
||||
print("1. _read_wikidata_entities", datetime.datetime.now())
|
||||
print()
|
||||
# title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
|
||||
title_to_id = _read_wikidata_entities_json(limit=None)
|
||||
|
||||
# write the title-ID mapping to file
|
||||
if write_entity_defs:
|
||||
with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file:
|
||||
entity_file.write("WP_title" + "|" + "WD_id" + "\n")
|
||||
for title, qid in title_to_id.items():
|
||||
entity_file.write(title + "|" + str(qid) + "\n")
|
||||
|
||||
title_list = list(title_to_id.keys())
|
||||
entity_list = [title_to_id[x] for x in title_list]
|
||||
|
||||
print()
|
||||
print("2. _get_entity_frequencies", datetime.datetime.now())
|
||||
print()
|
||||
entity_frequencies = _get_entity_frequencies(entities=title_list)
|
||||
|
||||
print()
|
||||
print("3. adding", len(entity_list), "entities", datetime.datetime.now())
|
||||
print()
|
||||
kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None)
|
||||
|
||||
print()
|
||||
print("4. adding aliases", datetime.datetime.now())
|
||||
print()
|
||||
_add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ)
|
||||
|
||||
# TODO: read wikipedia texts for entity context
|
||||
# _read_wikipedia()
|
||||
|
||||
if to_print:
|
||||
print()
|
||||
print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
|
||||
|
||||
print("done with kb", datetime.datetime.now())
|
||||
|
||||
return kb
|
||||
|
||||
|
||||
def _get_entity_frequencies(entities):
|
||||
entity_to_count = dict()
|
||||
with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter='|')
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_count[row[0]] = int(row[1])
|
||||
|
||||
return [entity_to_count.get(e, 0) for e in entities]
|
||||
|
||||
|
||||
def _get_entity_to_id():
|
||||
entity_to_id = dict()
|
||||
with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter='|')
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_id[row[0]] = row[1]
|
||||
|
||||
return entity_to_id
|
||||
|
||||
|
||||
def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
|
||||
wp_titles = title_to_id.keys()
|
||||
|
||||
if to_print:
|
||||
print("wp titles:", wp_titles)
|
||||
|
||||
# adding aliases with prior probabilities
|
||||
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
# we can read this file sequentially, it's sorted by alias, and then by count
|
||||
previous_alias = None
|
||||
total_count = 0
|
||||
counts = list()
|
||||
entities = list()
|
||||
while line:
|
||||
splits = line.replace('\n', "").split(sep='|')
|
||||
new_alias = splits[0]
|
||||
count = int(splits[1])
|
||||
entity = splits[2]
|
||||
|
||||
if new_alias != previous_alias and previous_alias:
|
||||
# done reading the previous alias --> output
|
||||
if len(entities) > 0:
|
||||
selected_entities = list()
|
||||
prior_probs = list()
|
||||
for ent_count, ent_string in zip(counts, entities):
|
||||
if ent_string in wp_titles:
|
||||
wd_id = title_to_id[ent_string]
|
||||
p_entity_givenalias = ent_count / total_count
|
||||
selected_entities.append(wd_id)
|
||||
prior_probs.append(p_entity_givenalias)
|
||||
|
||||
if selected_entities:
|
||||
try:
|
||||
kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs)
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
total_count = 0
|
||||
counts = list()
|
||||
entities = list()
|
||||
|
||||
total_count += count
|
||||
|
||||
if len(entities) < max_entities_per_alias and count >= min_occ:
|
||||
counts.append(count)
|
||||
entities.append(entity)
|
||||
previous_alias = new_alias
|
||||
|
||||
line = prior_file.readline()
|
||||
|
||||
if to_print:
|
||||
print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
|
||||
|
||||
|
||||
def _read_wikidata_entities_json(limit=None, to_print=False):
|
||||
""" Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """
|
||||
|
||||
languages = {'en', 'de'}
|
||||
prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
|
||||
site_filter = 'enwiki'
|
||||
|
||||
title_to_id = dict()
|
||||
|
||||
# parse appropriate fields - depending on what we need in the KB
|
||||
parse_properties = False
|
||||
parse_sitelinks = True
|
||||
parse_labels = False
|
||||
parse_descriptions = False
|
||||
parse_aliases = False
|
||||
|
||||
with bz2.open(WIKIDATA_JSON, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 500000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
clean_line = clean_line[:-1]
|
||||
if len(clean_line) > 1:
|
||||
obj = json.loads(clean_line)
|
||||
entry_type = obj["type"]
|
||||
|
||||
if entry_type == "item":
|
||||
# filtering records on their properties
|
||||
keep = False
|
||||
|
||||
claims = obj["claims"]
|
||||
for prop, value_set in prop_filter.items():
|
||||
claim_property = claims.get(prop, None)
|
||||
if claim_property:
|
||||
for cp in claim_property:
|
||||
cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
|
||||
cp_rank = cp['rank']
|
||||
if cp_rank != "deprecated" and cp_id in value_set:
|
||||
keep = True
|
||||
|
||||
if keep:
|
||||
unique_id = obj["id"]
|
||||
|
||||
if to_print:
|
||||
print("ID:", unique_id)
|
||||
print("type:", entry_type)
|
||||
|
||||
# parsing all properties that refer to other entities
|
||||
if parse_properties:
|
||||
for prop, claim_property in claims.items():
|
||||
cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
|
||||
cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
|
||||
if cp_values:
|
||||
if to_print:
|
||||
print("prop:", prop, cp_values)
|
||||
|
||||
if parse_sitelinks:
|
||||
site_value = obj["sitelinks"].get(site_filter, None)
|
||||
if site_value:
|
||||
site = site_value['title']
|
||||
if to_print:
|
||||
print(site_filter, ":", site)
|
||||
title_to_id[site] = unique_id
|
||||
# print(site, "for", unique_id)
|
||||
|
||||
if parse_labels:
|
||||
labels = obj["labels"]
|
||||
if labels:
|
||||
for lang in languages:
|
||||
lang_label = labels.get(lang, None)
|
||||
if lang_label:
|
||||
if to_print:
|
||||
print("label (" + lang + "):", lang_label["value"])
|
||||
|
||||
if parse_descriptions:
|
||||
descriptions = obj["descriptions"]
|
||||
if descriptions:
|
||||
for lang in languages:
|
||||
lang_descr = descriptions.get(lang, None)
|
||||
if lang_descr:
|
||||
if to_print:
|
||||
print("description (" + lang + "):", lang_descr["value"])
|
||||
|
||||
if parse_aliases:
|
||||
aliases = obj["aliases"]
|
||||
if aliases:
|
||||
for lang in languages:
|
||||
lang_aliases = aliases.get(lang, None)
|
||||
if lang_aliases:
|
||||
for item in lang_aliases:
|
||||
if to_print:
|
||||
print("alias (" + lang + "):", item["value"])
|
||||
|
||||
if to_print:
|
||||
print()
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
return title_to_id
|
||||
|
||||
|
||||
def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
|
||||
""" Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. """
|
||||
|
||||
regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
|
||||
regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
|
||||
regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
|
||||
regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
|
||||
|
||||
title_to_id = dict()
|
||||
|
||||
with bz2.open(WIKIDATA_JSON, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 500000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
clean_line = clean_line[:-1]
|
||||
if len(clean_line) > 1:
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
keep = False
|
||||
|
||||
p31_matches = regex_p31.findall(clean_line)
|
||||
if p31_matches:
|
||||
for p31_match in p31_matches:
|
||||
id_matches = regex_id.findall(p31_match)
|
||||
for id_match in id_matches:
|
||||
id_match = id_match[6:][:-1]
|
||||
if id_match == "Q5" or id_match == "Q15632617":
|
||||
keep = True
|
||||
|
||||
if keep:
|
||||
id_match = regex_id.search(clean_line).group(0)
|
||||
id_match = id_match[6:][:-1]
|
||||
|
||||
enwiki_matches = regex_enwiki.findall(clean_line)
|
||||
if enwiki_matches:
|
||||
for enwiki_match in enwiki_matches:
|
||||
title_match = regex_title.search(enwiki_match).group(0)
|
||||
title = title_match[9:][:-1]
|
||||
title_to_id[title] = id_match
|
||||
# print(title, "for", id_match)
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
return title_to_id
|
||||
|
||||
|
||||
def test_kb(kb):
|
||||
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
|
||||
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
|
||||
nlp.add_pipe(el_pipe, last=True)
|
||||
|
||||
candidates = my_kb.get_candidates("Bush")
|
||||
|
||||
print("generating candidates for 'Bush' :")
|
||||
for c in candidates:
|
||||
print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
|
||||
print()
|
||||
|
||||
text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
|
||||
"Douglas reminds us to always bring our towel. " \
|
||||
"The main character in Doug's novel is the man Arthur Dent, " \
|
||||
"but Douglas doesn't write about George Washington or Homer Simpson."
|
||||
doc = nlp(text)
|
||||
|
||||
for ent in doc.ents:
|
||||
print("ent", ent.text, ent.label_, ent.kb_id_)
|
||||
|
||||
|
||||
def add_coref():
|
||||
""" STEP 5: add coreference resolution to our model """
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
# nlp = spacy.load('en')
|
||||
|
||||
# TODO: this doesn't work yet
|
||||
# neuralcoref.add_to_pipe(nlp)
|
||||
print("done adding to pipe")
|
||||
|
||||
doc = nlp(u'My sister has a dog. She loves him.')
|
||||
print("done doc")
|
||||
|
||||
print(doc._.has_coref)
|
||||
print(doc._.coref_clusters)
|
||||
|
||||
|
||||
def create_training(kb):
|
||||
if not kb:
|
||||
raise ValueError("kb should be defined")
|
||||
# nlp = spacy.load('en_core_web_sm')
|
||||
wp_to_id = _get_entity_to_id()
|
||||
_read_wikipedia_texts(kb, wp_to_id, limit=100000000) # TODO: full dataset
|
||||
|
||||
|
||||
def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
||||
"""
|
||||
Read the XML wikipedia data to parse out training data:
|
||||
raw text data + positive and negative instances
|
||||
"""
|
||||
|
||||
title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
|
||||
id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
|
||||
|
||||
read_ids = set()
|
||||
|
||||
entityfile_loc = TRAINING_OUTPUT_SET_DIR + "/" + "gold_entities.csv"
|
||||
with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
|
||||
# write entity training header file
|
||||
_write_training_entity(outputfile=entityfile,
|
||||
article_id="article_id",
|
||||
alias="alias",
|
||||
entity="entity",
|
||||
correct="correct")
|
||||
|
||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 1000000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
# print(clean_line)
|
||||
|
||||
if clean_line == "<revision>":
|
||||
reading_revision = True
|
||||
elif clean_line == "</revision>":
|
||||
reading_revision = False
|
||||
|
||||
# Start reading new page
|
||||
if clean_line == "<page>":
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
|
||||
# finished reading this page
|
||||
elif clean_line == "</page>":
|
||||
if article_id:
|
||||
try:
|
||||
_process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip())
|
||||
# on a previous run, an error occurred after 46M lines and 2h
|
||||
except Exception as e:
|
||||
print("Error processing article", article_id, article_title)
|
||||
print(e)
|
||||
else:
|
||||
print("Done processing a page, but couldn't find an article_id ?")
|
||||
print(article_title)
|
||||
print(article_text)
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
|
||||
# start reading text within a page
|
||||
if "<text" in clean_line:
|
||||
reading_text = True
|
||||
|
||||
if reading_text:
|
||||
article_text += " " + clean_line
|
||||
|
||||
# stop reading text within a page (we assume a new page doesn't start on the same line)
|
||||
if "</text" in clean_line:
|
||||
reading_text = False
|
||||
|
||||
# read the ID of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
ids = id_regex.search(clean_line)
|
||||
if ids:
|
||||
article_id = ids[0]
|
||||
if article_id in read_ids:
|
||||
print("Found duplicate article ID", article_id, clean_line) # This should never happen ...
|
||||
read_ids.add(article_id)
|
||||
|
||||
# read the title of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
titles = title_regex.search(clean_line)
|
||||
if titles:
|
||||
article_title = titles[0].strip()
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
|
||||
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')


def _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text):
    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # print("WP article", article_id, ":", article_title)
    # print(text)

    # get the raw text without markup etc
    clean_text = _get_clean_wp_text(text)
    # print(clean_text)

    article_dict = dict()
    ambiguous_aliases = set()
    aliases, entities, normalizations = _get_wp_links(text)
    for alias, entity, norm in zip(aliases, entities, normalizations):
        if alias not in ambiguous_aliases:
            entity_id = wp_to_id.get(entity)
            if entity_id:
                # TODO: take care of these conflicts! Currently they are being removed from the dataset
                if article_dict.get(alias) and article_dict[alias] != entity_id:
                    ambiguous_aliases.add(alias)
                    article_dict.pop(alias)
                    # print("Found conflicting alias", alias, "in article", article_id, article_title)
                else:
                    article_dict[alias] = entity_id

    for alias, entity in article_dict.items():
        # print(alias, "-->", entity)
        candidates = kb.get_candidates(alias)

        # as training data, we only store entities that are sufficiently ambiguous
        if len(candidates) > 1:
            _write_training_article(article_id=article_id, clean_text=clean_text)

            # write all incorrect candidates
            for c in candidates:
                if entity != c.entity_:
                    _write_training_entity(outputfile=entityfile,
                                           article_id=article_id,
                                           alias=alias,
                                           entity=c.entity_,
                                           correct="0")

            # write the one correct candidate
            _write_training_entity(outputfile=entityfile,
                                   article_id=article_id,
                                   alias=alias,
                                   entity=entity,
                                   correct="1")

    # _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict)

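# For reference, the rows that the function above appends to gold_entities.csv for one
# sufficiently ambiguous alias look roughly like this (IDs and values are made up for
# illustration):
#
#   article_id|alias|entity|correct
#   12|Douglas|Q1000|0
#   12|Douglas|Q2000|0
#   12|Douglas|Q3000|1
#
# i.e. every incorrect KB candidate is written with correct="0", and the entity that
# was actually linked in the article gets correct="1".
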
# regexes used to strip wiki markup from the raw article text; note that the text in
# the dump is still XML-escaped, so HTML-like tags appear as &lt;...&gt;
info_regex = re.compile(r'{[^{]*?}')                       # innermost {...} of nested {{templates}}
interwiki_regex = re.compile(r'\[\[([^|]*?)]]')            # [[link]]
interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]')  # [[link|alternative name]]
html_regex = re.compile(r'&lt;!--[^!]*--&gt;')             # HTML comments
category_regex = re.compile(r'\[\[Category:[^\[]*]]')      # category links
file_regex = re.compile(r'\[\[File:[^[\]]+]]')             # file/image links
ref_regex = re.compile(r'&lt;ref.*?&gt;')                  # opening ref tags (non-greedy)
ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')               # closing ref tags (non-greedy)

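# Illustrative notes (nothing here is executed): substituting interwiki_regex by its
# captured group rewrites "[[Python (programming language)]]" to
# "Python (programming language)", while interwiki_2_regex rewrites
# "[[Python (programming language)|Python]]" to "Python". info_regex only matches an
# innermost {...} pair, which is why _get_clean_wp_text below applies it in a loop
# until the text stops shrinking, peeling nested templates one layer at a time.
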
def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace('\'\'\'', '')
    clean_text = clean_text.replace('\'\'', '')

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove simple interwiki links (no alternative name)
    clean_text = interwiki_regex.sub(r'\1', clean_text)

    # remove interwiki links with an alternative name, keeping only that name
    clean_text = interwiki_2_regex.sub(r'\1', clean_text)

    # remove HTML comments
    clean_text = html_regex.sub('', clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)

    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')

    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')

    return clean_text.strip()

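# A minimal, illustrative sketch (not called anywhere in this script) of what the
# cleanup above does. The article text is made up and kept in the XML-escaped form in
# which it appears in the dump:
def _demo_clean_wp_text():
    raw = ("'''Douglas Adams''' was an [[England|English]] writer, known for "
           "''[[The Hitchhiker's Guide to the Galaxy]]''. "
           "{{Infobox writer|name=Douglas Adams}} &lt;!-- cleanup --&gt;")
    return _get_clean_wp_text(raw)
    # -> "Douglas Adams was an English writer, known for The Hitchhiker's Guide to the Galaxy."
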
def _write_training_article(article_id, clean_text):
    file_loc = TRAINING_OUTPUT_SET_DIR + "/" + str(article_id) + ".txt"
    with open(file_loc, mode='w', encoding='utf8') as outputfile:
        outputfile.write(clean_text)


def _write_training_entity(outputfile, article_id, alias, entity, correct):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")

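# Illustrative sketch (an assumption, not part of this script) of how the resulting
# gold_entities.csv could be read back later when assembling the actual training set;
# the helper name is hypothetical:
def _demo_read_gold_entities(entityfile_loc):
    import csv  # local import so the sketch stays self-contained
    with open(entityfile_loc, mode='r', encoding='utf8') as entityfile:
        reader = csv.DictReader(entityfile, delimiter='|')
        for row in reader:
            yield row["article_id"], row["alias"], row["entity"], row["correct"] == "1"
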
def _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict):
    doc = nlp(clean_text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":  # TODO: expand to non-persons
            ent_id = article_dict.get(ent.text)
            if ent_id:
                print(" -", ent.text, ent.label_, ent_id)
            else:
                print(" -", ent.text, ent.label_, '???')  # TODO: investigate these cases

if __name__ == "__main__":
    print("START", datetime.datetime.now())
    print()
    my_kb = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    create_wp_training = True

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        read_wikipedia_prior_probs()
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        write_entity_counts()
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        my_nlp = spacy.load('en_core_web_sm')
        my_vocab = my_nlp.vocab
        my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        print("STEP 3b: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        my_vocab.to_disk(VOCAB_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab)
        my_kb.load_bulk(KB_FILE)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            test_kb(my_kb)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        create_training(my_kb)

    # TODO coreference resolution
    # add_coref()

    print()
    print("STOP", datetime.datetime.now())
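# Rough overview of what each step above leaves on disk (the path constants KB_FILE,
# VOCAB_DIR, ENWIKI_DUMP and TRAINING_OUTPUT_SET_DIR are presumably defined near the
# top of this file): STEP 1 writes the alias-to-entity prior-probability file, STEP 2
# the per-entity frequency counts, STEP 3 dumps the knowledge base to KB_FILE and its
# vocab to VOCAB_DIR, and STEP 5 writes gold_entities.csv plus one <article_id>.txt
# file per sufficiently ambiguous training article into TRAINING_OUTPUT_SET_DIR.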