KB extensions and better parsing of WikiData (#4375)
* fix overflow error on windows
* more documentation & logging fixes
* md fix
* 3 different limit parameters to play with execution time
* bug fixes directory locations
* small fixes
* exclude dev test articles from prior probabilities stats
* small fixes
* filtering wikidata entities, removing numeric and meta items
* adding aliases from wikidata also to the KB
* fix adding WD aliases
* adding also new aliases to previously added entities
* fixing comma's
* small doc fixes
* adding subclassof filtering
* append alias functionality in KB
* prevent appending the same entity-alias pair
* fix for appending WD aliases
* remove date filter
* remove unnecessary import
* small corrections and reformatting
* remove WD aliases for now (too slow)
* removing numeric entities from training and evaluation
* small fixes
* shortcut during prediction if there is only one candidate
* add counts and fscore logging, remove FP NER from evaluation
* fix entity_linker.predict to take docs instead of single sentences
* remove enumeration sentences from the WP dataset
* entity_linker.update to process full doc instead of single sentence
* spelling corrections and dump locations in readme
* NLP IO fix
* reading KB is unnecessary at the end of the pipeline
* small logging fix
* remove empty files
This commit is contained in:
parent
428887b8f2
commit
2d249a9502
34
bin/wiki_entity_linking/README.md
Normal file
@@ -0,0 +1,34 @@
## Entity Linking with Wikipedia and Wikidata

### Step 1: Create a Knowledge Base (KB) and training data

Run `wikipedia_pretrain_kb.py`
* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
  * Wikidata: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
  * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or the equivalent dump for any other language)
* You can set the filtering parameters for KB construction:
  * `max_per_alias`: maximum number of candidate entities in the KB per alias/synonym
  * `min_freq`: minimum number of times an entity should occur in the corpus to be included in the KB
  * `min_pair`: minimum number of times an entity+alias combination should occur in the corpus to be included in the KB
* Further parameters to set:
  * `descriptions_from_wikipedia`: whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`)
  * `entity_vector_length`: length of the pre-trained entity description vectors
  * `lang`: language for which to fetch Wikidata information (as the dump contains all languages)

Quick testing and rerunning:
* When trying out the pipeline for a quick test, set `limit_prior`, `limit_train` and/or `limit_wd` to read only parts of the dumps instead of everything.
* If you only want to (re)run certain parts of the pipeline, just remove the corresponding output files and they will be recalculated or reparsed.

See the sketch below for how the filtering parameters above feed into the underlying `create_kb()` call.
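For orientation, here is a minimal sketch (not part of this commit) of how the filtering parameters end up in the `kb_creator.create_kb()` call once the intermediate CSV files from the earlier pipeline steps exist; the model name, output directory, KB file name and numeric values are placeholders:

```python
from pathlib import Path
import spacy
from bin.wiki_entity_linking import kb_creator

output_dir = Path("output")              # placeholder: the KB directory
nlp = spacy.load("en_core_web_lg")       # placeholder: any model with pretrained vectors

kb = kb_creator.create_kb(
    nlp=nlp,
    max_entities_per_alias=10,           # README: max_per_alias
    min_entity_freq=20,                  # README: min_freq
    min_occ=5,                           # README: min_pair
    entity_def_path=output_dir / "entity_defs.csv",
    entity_descr_path=output_dir / "entity_descriptions.csv",
    entity_alias_path=output_dir / "entity_alias.csv",
    entity_freq_path=output_dir / "entity_freq.csv",
    prior_prob_path=output_dir / "prior_prob.csv",
    entity_vector_length=64,
)
kb.dump(output_dir / "kb")               # placeholder KB file name
```

In practice you run the script itself; the sketch only shows where `max_per_alias`, `min_freq` and `min_pair` are consumed.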
### Step 2: Train an Entity Linking model

Run `wikidata_train_entity_linker.py`
* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model**
* You can set the learning parameters for the EL training:
  * `epochs`: number of training iterations
  * `dropout`: dropout rate
  * `lr`: learning rate
  * `l2`: L2 regularization
* Specify the number of training and dev testing entities with `train_inst` and `dev_inst` respectively
* Further parameters to set:
  * `labels_discard`: NER label types to discard during training

The sketch below shows how the script wires the KB into an `entity_linker` pipe before training.
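The following is a minimal sketch, distilled from the training script changed in this commit, of how the KB is attached to an `entity_linker` pipe before the update loop runs; the paths and the `labels_discard` values are placeholders:

```python
from pathlib import Path
import spacy
from bin.wiki_entity_linking.kb_creator import read_kb

nlp = spacy.load(Path("output/nlp"))     # placeholder: model dir written in Step 1
kb = read_kb(nlp, Path("output/kb"))     # placeholder: KB file written in Step 1

# Only mentions found by a pretrained NER component can be linked.
if "ner" not in nlp.pipe_names:
    raise ValueError("The `nlp` object should have a pretrained `ner` component.")

el_pipe = nlp.create_pipe(
    name="entity_linker",
    config={
        "pretrained_vectors": nlp.vocab.vectors.name,
        "labels_discard": ["DATE", "CARDINAL"],   # placeholder NER types to skip
    },
)
el_pipe.set_kb(kb)
nlp.add_pipe(el_pipe, last=True)
```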
|
@ -6,6 +6,7 @@ OUTPUT_MODEL_DIR = "nlp"
|
|||
PRIOR_PROB_PATH = "prior_prob.csv"
|
||||
ENTITY_DEFS_PATH = "entity_defs.csv"
|
||||
ENTITY_FREQ_PATH = "entity_freq.csv"
|
||||
ENTITY_ALIAS_PATH = "entity_alias.csv"
|
||||
ENTITY_DESCR_PATH = "entity_descriptions.csv"
|
||||
|
||||
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
|
||||
|
|
|
@ -15,10 +15,11 @@ class Metrics(object):
|
|||
candidate_is_correct = true_entity == candidate
|
||||
|
||||
# Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL")
|
||||
# Therefore, if candidate_is_correct then we have a true positive and never a true negative
|
||||
# Therefore, if candidate_is_correct then we have a true positive and never a true negative.
|
||||
self.true_pos += candidate_is_correct
|
||||
self.false_neg += not candidate_is_correct
|
||||
if candidate not in {"", "NIL"}:
|
||||
if candidate and candidate not in {"", "NIL"}:
|
||||
# A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN.
|
||||
self.false_pos += not candidate_is_correct
|
||||
|
||||
def calculate_precision(self):
|
||||
|
@ -33,6 +34,14 @@ class Metrics(object):
|
|||
else:
|
||||
return self.true_pos / (self.true_pos + self.false_neg)
|
||||
|
||||
def calculate_fscore(self):
|
||||
p = self.calculate_precision()
|
||||
r = self.calculate_recall()
|
||||
if p + r == 0:
|
||||
return 0.0
|
||||
else:
|
||||
return 2 * p * r / (p + r)
|
||||
|
||||
|
||||
class EvaluationResults(object):
|
||||
def __init__(self):
|
||||
|
@ -43,18 +52,20 @@ class EvaluationResults(object):
|
|||
self.metrics.update_results(true_entity, candidate)
|
||||
self.metrics_by_label[ent_label].update_results(true_entity, candidate)
|
||||
|
||||
def increment_false_negatives(self):
|
||||
self.metrics.false_neg += 1
|
||||
|
||||
def report_metrics(self, model_name):
|
||||
model_str = model_name.title()
|
||||
recall = self.metrics.calculate_recall()
|
||||
precision = self.metrics.calculate_precision()
|
||||
return ("{}: ".format(model_str) +
|
||||
"Recall = {} | ".format(round(recall, 3)) +
|
||||
"Precision = {} | ".format(round(precision, 3)) +
|
||||
"Precision by label = {}".format({k: v.calculate_precision()
|
||||
for k, v in self.metrics_by_label.items()}))
|
||||
fscore = self.metrics.calculate_fscore()
|
||||
return (
|
||||
"{}: ".format(model_str)
|
||||
+ "F-score = {} | ".format(round(fscore, 3))
|
||||
+ "Recall = {} | ".format(round(recall, 3))
|
||||
+ "Precision = {} | ".format(round(precision, 3))
|
||||
+ "F-score by label = {}".format(
|
||||
{k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())}
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class BaselineResults(object):
|
||||
|
@ -63,25 +74,32 @@ class BaselineResults(object):
|
|||
self.prior = EvaluationResults()
|
||||
self.oracle = EvaluationResults()
|
||||
|
||||
def report_accuracy(self, model):
|
||||
def report_performance(self, model):
|
||||
results = getattr(self, model)
|
||||
return results.report_metrics(model)
|
||||
|
||||
def update_baselines(self, true_entity, ent_label, random_candidate, prior_candidate, oracle_candidate):
|
||||
def update_baselines(
|
||||
self,
|
||||
true_entity,
|
||||
ent_label,
|
||||
random_candidate,
|
||||
prior_candidate,
|
||||
oracle_candidate,
|
||||
):
|
||||
self.oracle.update_metrics(ent_label, true_entity, oracle_candidate)
|
||||
self.prior.update_metrics(ent_label, true_entity, prior_candidate)
|
||||
self.random.update_metrics(ent_label, true_entity, random_candidate)
|
||||
|
||||
|
||||
def measure_performance(dev_data, kb, el_pipe):
|
||||
baseline_accuracies = measure_baselines(
|
||||
dev_data, kb
|
||||
)
|
||||
|
||||
logger.info(baseline_accuracies.report_accuracy("random"))
|
||||
logger.info(baseline_accuracies.report_accuracy("prior"))
|
||||
logger.info(baseline_accuracies.report_accuracy("oracle"))
|
||||
def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True):
|
||||
if baseline:
|
||||
baseline_accuracies, counts = measure_baselines(dev_data, kb)
|
||||
logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())}))
|
||||
logger.info(baseline_accuracies.report_performance("random"))
|
||||
logger.info(baseline_accuracies.report_performance("prior"))
|
||||
logger.info(baseline_accuracies.report_performance("oracle"))
|
||||
|
||||
if context:
|
||||
# using only context
|
||||
el_pipe.cfg["incl_context"] = True
|
||||
el_pipe.cfg["incl_prior"] = False
|
||||
|
@ -96,7 +114,11 @@ def measure_performance(dev_data, kb, el_pipe):
|
|||
|
||||
|
||||
def get_eval_results(data, el_pipe=None):
|
||||
# If the docs in the data require further processing with an entity linker, set el_pipe
|
||||
"""
|
||||
Evaluate the ent.kb_id_ annotations against the gold standard.
|
||||
Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
|
||||
If the docs in the data require further processing with an entity linker, set el_pipe.
|
||||
"""
|
||||
from tqdm import tqdm
|
||||
|
||||
docs = []
|
||||
|
@ -111,18 +133,15 @@ def get_eval_results(data, el_pipe=None):
|
|||
|
||||
results = EvaluationResults()
|
||||
for doc, gold in zip(docs, golds):
|
||||
tagged_entries_per_article = {_offset(ent.start_char, ent.end_char): ent for ent in doc.ents}
|
||||
try:
|
||||
correct_entries_per_article = dict()
|
||||
for entity, kb_dict in gold.links.items():
|
||||
start, end = entity
|
||||
# only evaluating on positive examples
|
||||
for gold_kb, value in kb_dict.items():
|
||||
if value:
|
||||
# only evaluating on positive examples
|
||||
offset = _offset(start, end)
|
||||
correct_entries_per_article[offset] = gold_kb
|
||||
if offset not in tagged_entries_per_article:
|
||||
results.increment_false_negatives()
|
||||
|
||||
for ent in doc.ents:
|
||||
ent_label = ent.label_
|
||||
|
@ -142,7 +161,11 @@ def get_eval_results(data, el_pipe=None):
|
|||
|
||||
|
||||
def measure_baselines(data, kb):
|
||||
# Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound
|
||||
"""
|
||||
Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound.
|
||||
Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
|
||||
Also return a dictionary of counts by entity label.
|
||||
"""
|
||||
counts_d = dict()
|
||||
|
||||
baseline_results = BaselineResults()
|
||||
|
@ -152,7 +175,6 @@ def measure_baselines(data, kb):
|
|||
|
||||
for doc, gold in zip(docs, golds):
|
||||
correct_entries_per_article = dict()
|
||||
tagged_entries_per_article = {_offset(ent.start_char, ent.end_char): ent for ent in doc.ents}
|
||||
for entity, kb_dict in gold.links.items():
|
||||
start, end = entity
|
||||
for gold_kb, value in kb_dict.items():
|
||||
|
@ -160,10 +182,6 @@ def measure_baselines(data, kb):
|
|||
if value:
|
||||
offset = _offset(start, end)
|
||||
correct_entries_per_article[offset] = gold_kb
|
||||
if offset not in tagged_entries_per_article:
|
||||
baseline_results.random.increment_false_negatives()
|
||||
baseline_results.oracle.increment_false_negatives()
|
||||
baseline_results.prior.increment_false_negatives()
|
||||
|
||||
for ent in doc.ents:
|
||||
ent_label = ent.label_
|
||||
|
@ -176,7 +194,7 @@ def measure_baselines(data, kb):
|
|||
if gold_entity is not None:
|
||||
candidates = kb.get_candidates(ent.text)
|
||||
oracle_candidate = ""
|
||||
best_candidate = ""
|
||||
prior_candidate = ""
|
||||
random_candidate = ""
|
||||
if candidates:
|
||||
scores = []
|
||||
|
@ -187,13 +205,21 @@ def measure_baselines(data, kb):
|
|||
oracle_candidate = c.entity_
|
||||
|
||||
best_index = scores.index(max(scores))
|
||||
best_candidate = candidates[best_index].entity_
|
||||
prior_candidate = candidates[best_index].entity_
|
||||
random_candidate = random.choice(candidates).entity_
|
||||
|
||||
baseline_results.update_baselines(gold_entity, ent_label,
|
||||
random_candidate, best_candidate, oracle_candidate)
|
||||
current_count = counts_d.get(ent_label, 0)
|
||||
counts_d[ent_label] = current_count+1
|
||||
|
||||
return baseline_results
|
||||
baseline_results.update_baselines(
|
||||
gold_entity,
|
||||
ent_label,
|
||||
random_candidate,
|
||||
prior_candidate,
|
||||
oracle_candidate,
|
||||
)
|
||||
|
||||
return baseline_results, counts_d
|
||||
|
||||
|
||||
def _offset(start, end):
|
||||
|
|
|
@ -1,17 +1,12 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import csv
|
||||
import logging
|
||||
import spacy
|
||||
import sys
|
||||
|
||||
from spacy.kb import KnowledgeBase
|
||||
|
||||
from bin.wiki_entity_linking import wikipedia_processor as wp
|
||||
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
|
||||
|
||||
csv.field_size_limit(sys.maxsize)
|
||||
from bin.wiki_entity_linking import wiki_io as io
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -22,18 +17,24 @@ def create_kb(
|
|||
max_entities_per_alias,
|
||||
min_entity_freq,
|
||||
min_occ,
|
||||
entity_def_input,
|
||||
entity_def_path,
|
||||
entity_descr_path,
|
||||
count_input,
|
||||
prior_prob_input,
|
||||
entity_alias_path,
|
||||
entity_freq_path,
|
||||
prior_prob_path,
|
||||
entity_vector_length,
|
||||
):
|
||||
# Create the knowledge base from Wikidata entries
|
||||
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
|
||||
entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length)
|
||||
_define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path)
|
||||
return kb
|
||||
|
||||
|
||||
def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length):
|
||||
# read the mappings from file
|
||||
title_to_id = get_entity_to_id(entity_def_input)
|
||||
id_to_descr = get_id_to_description(entity_descr_path)
|
||||
title_to_id = io.read_title_to_id(entity_def_path)
|
||||
id_to_descr = io.read_id_to_descr(entity_descr_path)
|
||||
|
||||
# check the length of the nlp vectors
|
||||
if "vectors" in nlp.meta and nlp.vocab.vectors.size:
|
||||
|
@ -45,10 +46,8 @@ def create_kb(
|
|||
" cf. https://spacy.io/usage/models#languages."
|
||||
)
|
||||
|
||||
logger.info("Get entity frequencies")
|
||||
entity_frequencies = wp.get_all_frequencies(count_input=count_input)
|
||||
|
||||
logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
|
||||
entity_frequencies = io.read_entity_to_count(entity_freq_path)
|
||||
# filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
|
||||
filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
|
||||
title_to_id,
|
||||
|
@ -56,36 +55,33 @@ def create_kb(
|
|||
entity_frequencies,
|
||||
min_entity_freq
|
||||
)
|
||||
logger.info("Left with {} entities".format(len(description_list)))
|
||||
logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys())))
|
||||
|
||||
logger.info("Train entity encoder")
|
||||
logger.info("Training entity encoder")
|
||||
encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
|
||||
encoder.train(description_list=description_list, to_print=True)
|
||||
|
||||
logger.info("Get entity embeddings:")
|
||||
logger.info("Getting entity embeddings")
|
||||
embeddings = encoder.apply_encoder(description_list)
|
||||
|
||||
logger.info("Adding {} entities".format(len(entity_list)))
|
||||
kb.set_entities(
|
||||
entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
|
||||
)
|
||||
return entity_list, filtered_title_to_id
|
||||
|
||||
logger.info("Adding aliases")
|
||||
|
||||
def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
|
||||
logger.info("Adding aliases from Wikipedia and Wikidata")
|
||||
_add_aliases(
|
||||
kb,
|
||||
entity_list=entity_list,
|
||||
title_to_id=filtered_title_to_id,
|
||||
max_entities_per_alias=max_entities_per_alias,
|
||||
min_occ=min_occ,
|
||||
prior_prob_input=prior_prob_input,
|
||||
prior_prob_path=prior_prob_path,
|
||||
)
|
||||
|
||||
logger.info("KB size: {} entities, {} aliases".format(
|
||||
kb.get_size_entities(),
|
||||
kb.get_size_aliases()))
|
||||
|
||||
logger.info("Done with kb")
|
||||
return kb
|
||||
|
||||
|
||||
def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
|
||||
min_entity_freq: int = 10):
|
||||
|
@ -104,34 +100,13 @@ def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
|
|||
return filtered_title_to_id, entity_list, description_list, frequency_list
|
||||
|
||||
|
||||
def get_entity_to_id(entity_def_output):
|
||||
entity_to_id = dict()
|
||||
with entity_def_output.open("r", encoding="utf8") as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_id[row[0]] = row[1]
|
||||
return entity_to_id
|
||||
|
||||
|
||||
def get_id_to_description(entity_descr_path):
|
||||
id_to_desc = dict()
|
||||
with entity_descr_path.open("r", encoding="utf8") as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
id_to_desc[row[0]] = row[1]
|
||||
return id_to_desc
|
||||
|
||||
|
||||
def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input):
|
||||
def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
|
||||
wp_titles = title_to_id.keys()
|
||||
|
||||
# adding aliases with prior probabilities
|
||||
# we can read this file sequentially, it's sorted by alias, and then by count
|
||||
with prior_prob_input.open("r", encoding="utf8") as prior_file:
|
||||
logger.info("Adding WP aliases")
|
||||
with prior_prob_path.open("r", encoding="utf8") as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
|
@ -180,10 +155,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
|
|||
line = prior_file.readline()
|
||||
|
||||
|
||||
def read_nlp_kb(model_dir, kb_file):
|
||||
nlp = spacy.load(model_dir)
|
||||
def read_kb(nlp, kb_file):
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.load_bulk(kb_file)
|
||||
logger.info("kb entities: {}".format(kb.get_size_entities()))
|
||||
logger.info("kb aliases: {}".format(kb.get_size_aliases()))
|
||||
return nlp, kb
|
||||
return kb
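A short note on the API change above (a sketch; paths are placeholders): `read_nlp_kb` is gone, so callers now load the pipeline themselves and hand it to `read_kb`, exactly as the updated training script does.

```python
from pathlib import Path
import spacy
from bin.wiki_entity_linking.kb_creator import read_kb

# Before this commit:  nlp, kb = read_nlp_kb(model_dir, kb_file)
# After this commit the caller loads the model first and passes it in:
nlp = spacy.load(Path("output/nlp"))   # placeholder model directory
kb = read_kb(nlp, Path("output/kb"))   # placeholder KB file
```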
|
||||
|
|
|
@ -53,7 +53,7 @@ class EntityEncoder:
|
|||
|
||||
start = start + batch_size
|
||||
stop = min(stop + batch_size, len(description_list))
|
||||
logger.info("encoded: {} entities".format(stop))
|
||||
logger.info("Encoded: {} entities".format(stop))
|
||||
|
||||
return encodings
|
||||
|
||||
|
@ -62,7 +62,7 @@ class EntityEncoder:
|
|||
if to_print:
|
||||
logger.info(
|
||||
"Trained entity descriptions on {} ".format(processed) +
|
||||
"(non-unique) entities across {} ".format(self.epochs) +
|
||||
"(non-unique) descriptions across {} ".format(self.epochs) +
|
||||
"epochs"
|
||||
)
|
||||
logger.info("Final loss: {}".format(loss))
|
||||
|
|
|
@ -1,395 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
import bz2
|
||||
import json
|
||||
|
||||
from functools import partial
|
||||
|
||||
from spacy.gold import GoldParse
|
||||
from bin.wiki_entity_linking import kb_creator
|
||||
|
||||
"""
|
||||
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
|
||||
Gold-standard entities are stored in one file in standoff format (by character offset).
|
||||
"""
|
||||
|
||||
ENTITY_FILE = "gold_entities.csv"
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_training_examples_and_descriptions(wikipedia_input,
|
||||
entity_def_input,
|
||||
description_output,
|
||||
training_output,
|
||||
parse_descriptions,
|
||||
limit=None):
|
||||
wp_to_id = kb_creator.get_entity_to_id(entity_def_input)
|
||||
_process_wikipedia_texts(wikipedia_input,
|
||||
wp_to_id,
|
||||
description_output,
|
||||
training_output,
|
||||
parse_descriptions,
|
||||
limit)
|
||||
|
||||
|
||||
def _process_wikipedia_texts(wikipedia_input,
|
||||
wp_to_id,
|
||||
output,
|
||||
training_output,
|
||||
parse_descriptions,
|
||||
limit=None):
|
||||
"""
|
||||
Read the XML wikipedia data to parse out training data:
|
||||
raw text data + positive instances
|
||||
"""
|
||||
title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
|
||||
id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
|
||||
|
||||
read_ids = set()
|
||||
|
||||
with output.open("a", encoding="utf8") as descr_file, training_output.open("w", encoding="utf8") as entity_file:
|
||||
if parse_descriptions:
|
||||
_write_training_description(descr_file, "WD_id", "description")
|
||||
with bz2.open(wikipedia_input, mode="rb") as file:
|
||||
article_count = 0
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
|
||||
logger.info("Processed {} articles".format(article_count))
|
||||
|
||||
for line in file:
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
|
||||
if clean_line == "<revision>":
|
||||
reading_revision = True
|
||||
elif clean_line == "</revision>":
|
||||
reading_revision = False
|
||||
|
||||
# Start reading new page
|
||||
if clean_line == "<page>":
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
# finished reading this page
|
||||
elif clean_line == "</page>":
|
||||
if article_id:
|
||||
clean_text, entities = _process_wp_text(
|
||||
article_title,
|
||||
article_text,
|
||||
wp_to_id
|
||||
)
|
||||
if clean_text is not None and entities is not None:
|
||||
_write_training_entities(entity_file,
|
||||
article_id,
|
||||
clean_text,
|
||||
entities)
|
||||
|
||||
if article_title in wp_to_id and parse_descriptions:
|
||||
description = " ".join(clean_text[:1000].split(" ")[:-1])
|
||||
_write_training_description(
|
||||
descr_file,
|
||||
wp_to_id[article_title],
|
||||
description
|
||||
)
|
||||
article_count += 1
|
||||
if article_count % 10000 == 0:
|
||||
logger.info("Processed {} articles".format(article_count))
|
||||
if limit and article_count >= limit:
|
||||
break
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
|
||||
# start reading text within a page
|
||||
if "<text" in clean_line:
|
||||
reading_text = True
|
||||
|
||||
if reading_text:
|
||||
article_text += " " + clean_line
|
||||
|
||||
# stop reading text within a page (we assume a new page doesn't start on the same line)
|
||||
if "</text" in clean_line:
|
||||
reading_text = False
|
||||
|
||||
# read the ID of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
ids = id_regex.search(clean_line)
|
||||
if ids:
|
||||
article_id = ids[0]
|
||||
if article_id in read_ids:
|
||||
logger.info(
|
||||
"Found duplicate article ID", article_id, clean_line
|
||||
) # This should never happen ...
|
||||
read_ids.add(article_id)
|
||||
|
||||
# read the title of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
titles = title_regex.search(clean_line)
|
||||
if titles:
|
||||
article_title = titles[0].strip()
|
||||
logger.info("Finished. Processed {} articles".format(article_count))
|
||||
|
||||
|
||||
text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
|
||||
info_regex = re.compile(r"{[^{]*?}")
|
||||
htlm_regex = re.compile(r"<!--[^-]*-->")
|
||||
category_regex = re.compile(r"\[\[Category:[^\[]*]]")
|
||||
file_regex = re.compile(r"\[\[File:[^[\]]+]]")
|
||||
ref_regex = re.compile(r"<ref.*?>") # non-greedy
|
||||
ref_2_regex = re.compile(r"</ref.*?>") # non-greedy
|
||||
|
||||
|
||||
def _process_wp_text(article_title, article_text, wp_to_id):
|
||||
# ignore meta Wikipedia pages
|
||||
if (
|
||||
article_title.startswith("Wikipedia:") or
|
||||
article_title.startswith("Kategori:")
|
||||
):
|
||||
return None, None
|
||||
|
||||
# remove the text tags
|
||||
text_search = text_regex.search(article_text)
|
||||
if text_search is None:
|
||||
return None, None
|
||||
text = text_search.group(0)
|
||||
|
||||
# stop processing if this is a redirect page
|
||||
if text.startswith("#REDIRECT"):
|
||||
return None, None
|
||||
|
||||
# get the raw text without markup etc, keeping only interwiki links
|
||||
clean_text, entities = _remove_links(_get_clean_wp_text(text), wp_to_id)
|
||||
return clean_text, entities
|
||||
|
||||
|
||||
def _get_clean_wp_text(article_text):
|
||||
clean_text = article_text.strip()
|
||||
|
||||
# remove bolding & italic markup
|
||||
clean_text = clean_text.replace("'''", "")
|
||||
clean_text = clean_text.replace("''", "")
|
||||
|
||||
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
|
||||
try_again = True
|
||||
previous_length = len(clean_text)
|
||||
while try_again:
|
||||
clean_text = info_regex.sub(
|
||||
"", clean_text
|
||||
) # non-greedy match excluding a nested {
|
||||
if len(clean_text) < previous_length:
|
||||
try_again = True
|
||||
else:
|
||||
try_again = False
|
||||
previous_length = len(clean_text)
|
||||
|
||||
# remove HTML comments
|
||||
clean_text = htlm_regex.sub("", clean_text)
|
||||
|
||||
# remove Category and File statements
|
||||
clean_text = category_regex.sub("", clean_text)
|
||||
clean_text = file_regex.sub("", clean_text)
|
||||
|
||||
# remove multiple =
|
||||
while "==" in clean_text:
|
||||
clean_text = clean_text.replace("==", "=")
|
||||
|
||||
clean_text = clean_text.replace(". =", ".")
|
||||
clean_text = clean_text.replace(" = ", ". ")
|
||||
clean_text = clean_text.replace("= ", ".")
|
||||
clean_text = clean_text.replace(" =", "")
|
||||
|
||||
# remove refs (non-greedy match)
|
||||
clean_text = ref_regex.sub("", clean_text)
|
||||
clean_text = ref_2_regex.sub("", clean_text)
|
||||
|
||||
# remove additional wikiformatting
|
||||
clean_text = re.sub(r"<blockquote>", "", clean_text)
|
||||
clean_text = re.sub(r"</blockquote>", "", clean_text)
|
||||
|
||||
# change special characters back to normal ones
|
||||
clean_text = clean_text.replace(r"<", "<")
|
||||
clean_text = clean_text.replace(r">", ">")
|
||||
clean_text = clean_text.replace(r""", '"')
|
||||
clean_text = clean_text.replace(r"&nbsp;", " ")
|
||||
clean_text = clean_text.replace(r"&", "&")
|
||||
|
||||
# remove multiple spaces
|
||||
while " " in clean_text:
|
||||
clean_text = clean_text.replace(" ", " ")
|
||||
|
||||
return clean_text.strip()
|
||||
|
||||
|
||||
def _remove_links(clean_text, wp_to_id):
|
||||
# read the text char by char to get the right offsets for the interwiki links
|
||||
entities = []
|
||||
final_text = ""
|
||||
open_read = 0
|
||||
reading_text = True
|
||||
reading_entity = False
|
||||
reading_mention = False
|
||||
reading_special_case = False
|
||||
entity_buffer = ""
|
||||
mention_buffer = ""
|
||||
for index, letter in enumerate(clean_text):
|
||||
if letter == "[":
|
||||
open_read += 1
|
||||
elif letter == "]":
|
||||
open_read -= 1
|
||||
elif letter == "|":
|
||||
if reading_text:
|
||||
final_text += letter
|
||||
# switch from reading entity to mention in the [[entity|mention]] pattern
|
||||
elif reading_entity:
|
||||
reading_text = False
|
||||
reading_entity = False
|
||||
reading_mention = True
|
||||
else:
|
||||
reading_special_case = True
|
||||
else:
|
||||
if reading_entity:
|
||||
entity_buffer += letter
|
||||
elif reading_mention:
|
||||
mention_buffer += letter
|
||||
elif reading_text:
|
||||
final_text += letter
|
||||
else:
|
||||
raise ValueError("Not sure at point", clean_text[index - 2: index + 2])
|
||||
|
||||
if open_read > 2:
|
||||
reading_special_case = True
|
||||
|
||||
if open_read == 2 and reading_text:
|
||||
reading_text = False
|
||||
reading_entity = True
|
||||
reading_mention = False
|
||||
|
||||
# we just finished reading an entity
|
||||
if open_read == 0 and not reading_text:
|
||||
if "#" in entity_buffer or entity_buffer.startswith(":"):
|
||||
reading_special_case = True
|
||||
# Ignore cases with nested structures like File: handles etc
|
||||
if not reading_special_case:
|
||||
if not mention_buffer:
|
||||
mention_buffer = entity_buffer
|
||||
start = len(final_text)
|
||||
end = start + len(mention_buffer)
|
||||
qid = wp_to_id.get(entity_buffer, None)
|
||||
if qid:
|
||||
entities.append((mention_buffer, qid, start, end))
|
||||
final_text += mention_buffer
|
||||
|
||||
entity_buffer = ""
|
||||
mention_buffer = ""
|
||||
|
||||
reading_text = True
|
||||
reading_entity = False
|
||||
reading_mention = False
|
||||
reading_special_case = False
|
||||
return final_text, entities
|
||||
|
||||
|
||||
def _write_training_description(outputfile, qid, description):
|
||||
if description is not None:
|
||||
line = str(qid) + "|" + description + "\n"
|
||||
outputfile.write(line)
|
||||
|
||||
|
||||
def _write_training_entities(outputfile, article_id, clean_text, entities):
|
||||
entities_data = [{"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]} for ent in entities]
|
||||
line = json.dumps(
|
||||
{
|
||||
"article_id": article_id,
|
||||
"clean_text": clean_text,
|
||||
"entities": entities_data
|
||||
},
|
||||
ensure_ascii=False) + "\n"
|
||||
outputfile.write(line)
|
||||
|
||||
|
||||
def read_training(nlp, entity_file_path, dev, limit, kb):
|
||||
""" This method provides training examples that correspond to the entity annotations found by the nlp object.
|
||||
For training, it will include negative training examples by using the candidate generator,
|
||||
and it will only keep positive training examples that can be found by using the candidate generator.
|
||||
For testing, it will include all positive examples only."""
|
||||
|
||||
from tqdm import tqdm
|
||||
data = []
|
||||
num_entities = 0
|
||||
get_gold_parse = partial(_get_gold_parse, dev=dev, kb=kb)
|
||||
|
||||
logger.info("Reading {} data with limit {}".format('dev' if dev else 'train', limit))
|
||||
with entity_file_path.open("r", encoding="utf8") as file:
|
||||
with tqdm(total=limit, leave=False) as pbar:
|
||||
for i, line in enumerate(file):
|
||||
example = json.loads(line)
|
||||
article_id = example["article_id"]
|
||||
clean_text = example["clean_text"]
|
||||
entities = example["entities"]
|
||||
|
||||
if dev != is_dev(article_id) or len(clean_text) >= 30000:
|
||||
continue
|
||||
|
||||
doc = nlp(clean_text)
|
||||
gold = get_gold_parse(doc, entities)
|
||||
if gold and len(gold.links) > 0:
|
||||
data.append((doc, gold))
|
||||
num_entities += len(gold.links)
|
||||
pbar.update(len(gold.links))
|
||||
if limit and num_entities >= limit:
|
||||
break
|
||||
logger.info("Read {} entities in {} articles".format(num_entities, len(data)))
|
||||
return data
|
||||
|
||||
|
||||
def _get_gold_parse(doc, entities, dev, kb):
|
||||
gold_entities = {}
|
||||
tagged_ent_positions = set(
|
||||
[(ent.start_char, ent.end_char) for ent in doc.ents]
|
||||
)
|
||||
|
||||
for entity in entities:
|
||||
entity_id = entity["entity"]
|
||||
alias = entity["alias"]
|
||||
start = entity["start"]
|
||||
end = entity["end"]
|
||||
|
||||
candidates = kb.get_candidates(alias)
|
||||
candidate_ids = [
|
||||
c.entity_ for c in candidates
|
||||
]
|
||||
|
||||
should_add_ent = (
|
||||
dev or
|
||||
(
|
||||
(start, end) in tagged_ent_positions and
|
||||
entity_id in candidate_ids and
|
||||
len(candidates) > 1
|
||||
)
|
||||
)
|
||||
|
||||
if should_add_ent:
|
||||
value_by_id = {entity_id: 1.0}
|
||||
if not dev:
|
||||
random.shuffle(candidate_ids)
|
||||
value_by_id.update({
|
||||
kb_id: 0.0
|
||||
for kb_id in candidate_ids
|
||||
if kb_id != entity_id
|
||||
})
|
||||
gold_entities[(start, end)] = value_by_id
|
||||
|
||||
return GoldParse(doc, links=gold_entities)
|
||||
|
||||
|
||||
def is_dev(article_id):
|
||||
return article_id.endswith("3")
|
127
bin/wiki_entity_linking/wiki_io.py
Normal file
|
@ -0,0 +1,127 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
import csv
|
||||
|
||||
# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
|
||||
csv.field_size_limit(min(sys.maxsize, 2147483646))
|
||||
|
||||
""" This class provides reading/writing methods for temp files """
|
||||
|
||||
|
||||
# Entity definition: WP title -> WD ID #
|
||||
def write_title_to_id(entity_def_output, title_to_id):
|
||||
with entity_def_output.open("w", encoding="utf8") as id_file:
|
||||
id_file.write("WP_title" + "|" + "WD_id" + "\n")
|
||||
for title, qid in title_to_id.items():
|
||||
id_file.write(title + "|" + str(qid) + "\n")
|
||||
|
||||
|
||||
def read_title_to_id(entity_def_output):
|
||||
title_to_id = dict()
|
||||
with entity_def_output.open("r", encoding="utf8") as id_file:
|
||||
csvreader = csv.reader(id_file, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
title_to_id[row[0]] = row[1]
|
||||
return title_to_id
|
||||
|
||||
|
||||
# Entity aliases from WD: WD ID -> WD alias #
|
||||
def write_id_to_alias(entity_alias_path, id_to_alias):
|
||||
with entity_alias_path.open("w", encoding="utf8") as alias_file:
|
||||
alias_file.write("WD_id" + "|" + "alias" + "\n")
|
||||
for qid, alias_list in id_to_alias.items():
|
||||
for alias in alias_list:
|
||||
alias_file.write(str(qid) + "|" + alias + "\n")
|
||||
|
||||
|
||||
def read_id_to_alias(entity_alias_path):
|
||||
id_to_alias = dict()
|
||||
with entity_alias_path.open("r", encoding="utf8") as alias_file:
|
||||
csvreader = csv.reader(alias_file, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
qid = row[0]
|
||||
alias = row[1]
|
||||
alias_list = id_to_alias.get(qid, [])
|
||||
alias_list.append(alias)
|
||||
id_to_alias[qid] = alias_list
|
||||
return id_to_alias
|
||||
|
||||
|
||||
def read_alias_to_id_generator(entity_alias_path):
|
||||
""" Read (aliases, qid) tuples """
|
||||
|
||||
with entity_alias_path.open("r", encoding="utf8") as alias_file:
|
||||
csvreader = csv.reader(alias_file, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
qid = row[0]
|
||||
alias = row[1]
|
||||
yield alias, qid
|
||||
|
||||
|
||||
# Entity descriptions from WD: WD ID -> WD description #
|
||||
def write_id_to_descr(entity_descr_output, id_to_descr):
|
||||
with entity_descr_output.open("w", encoding="utf8") as descr_file:
|
||||
descr_file.write("WD_id" + "|" + "description" + "\n")
|
||||
for qid, descr in id_to_descr.items():
|
||||
descr_file.write(str(qid) + "|" + descr + "\n")
|
||||
|
||||
|
||||
def read_id_to_descr(entity_desc_path):
|
||||
id_to_desc = dict()
|
||||
with entity_desc_path.open("r", encoding="utf8") as descr_file:
|
||||
csvreader = csv.reader(descr_file, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
id_to_desc[row[0]] = row[1]
|
||||
return id_to_desc
|
||||
|
||||
|
||||
# Entity counts from WP: WP title -> count #
|
||||
def write_entity_to_count(prior_prob_input, count_output):
|
||||
# Write entity counts for quick access later
|
||||
entity_to_count = dict()
|
||||
total_count = 0
|
||||
|
||||
with prior_prob_input.open("r", encoding="utf8") as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
|
||||
while line:
|
||||
splits = line.replace("\n", "").split(sep="|")
|
||||
# alias = splits[0]
|
||||
count = int(splits[1])
|
||||
entity = splits[2]
|
||||
|
||||
current_count = entity_to_count.get(entity, 0)
|
||||
entity_to_count[entity] = current_count + count
|
||||
|
||||
total_count += count
|
||||
|
||||
line = prior_file.readline()
|
||||
|
||||
with count_output.open("w", encoding="utf8") as entity_file:
|
||||
entity_file.write("entity" + "|" + "count" + "\n")
|
||||
for entity, count in entity_to_count.items():
|
||||
entity_file.write(entity + "|" + str(count) + "\n")
|
||||
|
||||
|
||||
def read_entity_to_count(count_input):
|
||||
entity_to_count = dict()
|
||||
with count_input.open("r", encoding="utf8") as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_count[row[0]] = int(row[1])
|
||||
|
||||
return entity_to_count
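A quick round-trip sketch of the new `wiki_io` helpers (the file path and the entity values are hypothetical); all of these files share the same `|`-delimited format with a single header row:

```python
from pathlib import Path
from bin.wiki_entity_linking import wiki_io as io

entity_defs = Path("entity_defs.csv")            # hypothetical temp file

io.write_title_to_id(entity_defs, {"Douglas Adams": "Q42"})
title_to_id = io.read_title_to_id(entity_defs)   # {"Douglas Adams": "Q42"}
assert title_to_id["Douglas Adams"] == "Q42"
```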
|
128
bin/wiki_entity_linking/wiki_namespaces.py
Normal file
|
@ -0,0 +1,128 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# List of meta pages in Wikidata, should be kept out of the Knowledge base
|
||||
WD_META_ITEMS = [
|
||||
"Q163875",
|
||||
"Q191780",
|
||||
"Q224414",
|
||||
"Q4167836",
|
||||
"Q4167410",
|
||||
"Q4663903",
|
||||
"Q11266439",
|
||||
"Q13406463",
|
||||
"Q15407973",
|
||||
"Q18616576",
|
||||
"Q19887878",
|
||||
"Q22808320",
|
||||
"Q23894233",
|
||||
"Q33120876",
|
||||
"Q42104522",
|
||||
"Q47460393",
|
||||
"Q64875536",
|
||||
"Q66480449",
|
||||
]
|
||||
|
||||
|
||||
# TODO: add more cases from non-English WP's
|
||||
|
||||
# List of prefixes that refer to Wikipedia "file" pages
|
||||
WP_FILE_NAMESPACE = ["Bestand", "File"]
|
||||
|
||||
# List of prefixes that refer to Wikipedia "category" pages
|
||||
WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
|
||||
|
||||
# List of prefixes that refer to Wikipedia "meta" pages
|
||||
# these will/should be matched ignoring case
|
||||
WP_META_NAMESPACE = (
|
||||
WP_FILE_NAMESPACE
|
||||
+ WP_CATEGORY_NAMESPACE
|
||||
+ [
|
||||
"b",
|
||||
"betawikiversity",
|
||||
"Book",
|
||||
"c",
|
||||
"Commons",
|
||||
"d",
|
||||
"dbdump",
|
||||
"download",
|
||||
"Draft",
|
||||
"Education",
|
||||
"Foundation",
|
||||
"Gadget",
|
||||
"Gadget definition",
|
||||
"Gebruiker",
|
||||
"gerrit",
|
||||
"Help",
|
||||
"Image",
|
||||
"Incubator",
|
||||
"m",
|
||||
"mail",
|
||||
"mailarchive",
|
||||
"media",
|
||||
"MediaWiki",
|
||||
"MediaWiki talk",
|
||||
"Mediawikiwiki",
|
||||
"MediaZilla",
|
||||
"Meta",
|
||||
"Metawikipedia",
|
||||
"Module",
|
||||
"mw",
|
||||
"n",
|
||||
"nost",
|
||||
"oldwikisource",
|
||||
"otrs",
|
||||
"OTRSwiki",
|
||||
"Overleg gebruiker",
|
||||
"outreach",
|
||||
"outreachwiki",
|
||||
"Portal",
|
||||
"phab",
|
||||
"Phabricator",
|
||||
"Project",
|
||||
"q",
|
||||
"quality",
|
||||
"rev",
|
||||
"s",
|
||||
"spcom",
|
||||
"Special",
|
||||
"species",
|
||||
"Strategy",
|
||||
"sulutil",
|
||||
"svn",
|
||||
"Talk",
|
||||
"Template",
|
||||
"Template talk",
|
||||
"Testwiki",
|
||||
"ticket",
|
||||
"TimedText",
|
||||
"Toollabs",
|
||||
"tools",
|
||||
"tswiki",
|
||||
"User",
|
||||
"User talk",
|
||||
"v",
|
||||
"voy",
|
||||
"w",
|
||||
"Wikibooks",
|
||||
"Wikidata",
|
||||
"wikiHow",
|
||||
"Wikinvest",
|
||||
"wikilivres",
|
||||
"Wikimedia",
|
||||
"Wikinews",
|
||||
"Wikipedia",
|
||||
"Wikipedia talk",
|
||||
"Wikiquote",
|
||||
"Wikisource",
|
||||
"Wikispecies",
|
||||
"Wikitech",
|
||||
"Wikiversity",
|
||||
"Wikivoyage",
|
||||
"wikt",
|
||||
"wiktionary",
|
||||
"wmf",
|
||||
"wmania",
|
||||
"WP",
|
||||
]
|
||||
)
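These namespace lists are consumed elsewhere in the pipeline (not shown in this diff); one plausible, hypothetical use is to drop interwiki links that point at meta pages, matching the prefixes case-insensitively as the comment above suggests:

```python
from bin.wiki_entity_linking.wiki_namespaces import WP_META_NAMESPACE

# Lower-cased set for case-insensitive prefix matching.
_META_PREFIXES = {ns.lower() for ns in WP_META_NAMESPACE}

def is_meta_link(link_target):
    """Return True for targets like 'File:Example.jpg' or 'Category:Physics'."""
    if ":" not in link_target:
        return False
    prefix = link_target.split(":", 1)[0].strip().lower()
    return prefix in _META_PREFIXES

print(is_meta_link("File:Example.jpg"))  # True
print(is_meta_link("Douglas Adams"))     # False
```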
|
|
@ -18,11 +18,12 @@ from pathlib import Path
|
|||
import plac
|
||||
|
||||
from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd
|
||||
from bin.wiki_entity_linking import wiki_io as io
|
||||
from bin.wiki_entity_linking import kb_creator
|
||||
from bin.wiki_entity_linking import training_set_creator
|
||||
from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT
|
||||
from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH
|
||||
from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH
|
||||
import spacy
|
||||
from bin.wiki_entity_linking.kb_creator import read_kb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -39,9 +40,11 @@ logger = logging.getLogger(__name__)
|
|||
loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
|
||||
loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
|
||||
loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
|
||||
descriptions_from_wikipedia=("Flag for using wp descriptions not wd", "flag", "wp"),
|
||||
limit=("Optional threshold to limit lines read from dumps", "option", "l", int),
|
||||
lang=("Optional language for which to get wikidata titles. Defaults to 'en'", "option", "la", str),
|
||||
descr_from_wp=("Flag for using wp descriptions not wd", "flag", "wp"),
|
||||
limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
|
||||
limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
|
||||
limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
|
||||
lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str),
|
||||
)
|
||||
def main(
|
||||
wd_json,
|
||||
|
@ -54,13 +57,16 @@ def main(
|
|||
entity_vector_length=64,
|
||||
loc_prior_prob=None,
|
||||
loc_entity_defs=None,
|
||||
loc_entity_alias=None,
|
||||
loc_entity_desc=None,
|
||||
descriptions_from_wikipedia=False,
|
||||
limit=None,
|
||||
descr_from_wp=False,
|
||||
limit_prior=None,
|
||||
limit_train=None,
|
||||
limit_wd=None,
|
||||
lang="en",
|
||||
):
|
||||
|
||||
entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
|
||||
entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
|
||||
entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
|
||||
entity_freq_path = output_dir / ENTITY_FREQ_PATH
|
||||
prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
|
||||
|
@ -69,15 +75,12 @@ def main(
|
|||
|
||||
logger.info("Creating KB with Wikipedia and WikiData")
|
||||
|
||||
if limit is not None:
|
||||
logger.warning("Warning: reading only {} lines of Wikipedia/Wikidata dumps.".format(limit))
|
||||
|
||||
# STEP 0: set up IO
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir(parents=True)
|
||||
|
||||
# STEP 1: create the NLP object
|
||||
logger.info("STEP 1: Loading model {}".format(model))
|
||||
# STEP 1: Load the NLP object
|
||||
logger.info("STEP 1: Loading NLP model {}".format(model))
|
||||
nlp = spacy.load(model)
|
||||
|
||||
# check the length of the nlp vectors
|
||||
|
@ -90,62 +93,83 @@ def main(
|
|||
# STEP 2: create prior probabilities from WP
|
||||
if not prior_prob_path.exists():
|
||||
# It takes about 2h to process 1000M lines of Wikipedia XML dump
|
||||
logger.info("STEP 2: writing prior probabilities to {}".format(prior_prob_path))
|
||||
wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit)
|
||||
logger.info("STEP 2: reading prior probabilities from {}".format(prior_prob_path))
|
||||
logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
|
||||
if limit_prior is not None:
|
||||
logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
|
||||
wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
|
||||
else:
|
||||
logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))
|
||||
|
||||
# STEP 3: deduce entity frequencies from WP (takes only a few minutes)
|
||||
logger.info("STEP 3: calculating entity frequencies")
|
||||
wp.write_entity_counts(prior_prob_path, entity_freq_path, to_print=False)
|
||||
# STEP 3: calculate entity frequencies
|
||||
if not entity_freq_path.exists():
|
||||
logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
|
||||
io.write_entity_to_count(prior_prob_path, entity_freq_path)
|
||||
else:
|
||||
logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))
|
||||
|
||||
# STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
|
||||
message = " and descriptions" if not descriptions_from_wikipedia else ""
|
||||
if (not entity_defs_path.exists()) or (not descriptions_from_wikipedia and not entity_descr_path.exists()):
|
||||
if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()):
|
||||
# It takes about 10h to process 55M lines of Wikidata JSON dump
|
||||
logger.info("STEP 4: parsing wikidata for entity definitions" + message)
|
||||
title_to_id, id_to_descr = wd.read_wikidata_entities_json(
|
||||
logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
|
||||
if limit_wd is not None:
|
||||
logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
|
||||
title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
|
||||
wd_json,
|
||||
limit,
|
||||
limit_wd,
|
||||
to_print=False,
|
||||
lang=lang,
|
||||
parse_descriptions=(not descriptions_from_wikipedia),
|
||||
parse_descr=(not descr_from_wp),
|
||||
)
|
||||
wd.write_entity_files(entity_defs_path, title_to_id)
|
||||
if not descriptions_from_wikipedia:
|
||||
wd.write_entity_description_files(entity_descr_path, id_to_descr)
|
||||
logger.info("STEP 4: read entity definitions" + message)
|
||||
io.write_title_to_id(entity_defs_path, title_to_id)
|
||||
|
||||
# STEP 5: Getting gold entities from wikipedia
|
||||
message = " and descriptions" if descriptions_from_wikipedia else ""
|
||||
if (not training_entities_path.exists()) or (descriptions_from_wikipedia and not entity_descr_path.exists()):
|
||||
logger.info("STEP 5: parsing wikipedia for gold entities" + message)
|
||||
training_set_creator.create_training_examples_and_descriptions(
|
||||
wp_xml,
|
||||
entity_defs_path,
|
||||
entity_descr_path,
|
||||
training_entities_path,
|
||||
parse_descriptions=descriptions_from_wikipedia,
|
||||
limit=limit,
|
||||
)
|
||||
logger.info("STEP 5: read gold entities" + message)
|
||||
logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
|
||||
io.write_id_to_alias(entity_alias_path, id_to_alias)
|
||||
|
||||
if not descr_from_wp:
|
||||
logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
|
||||
io.write_id_to_descr(entity_descr_path, id_to_descr)
|
||||
else:
|
||||
logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
|
||||
logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
|
||||
if not descr_from_wp:
|
||||
logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))
|
||||
|
||||
# STEP 5: Getting gold entities from Wikipedia
|
||||
if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()):
|
||||
logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
|
||||
if limit_train is not None:
|
||||
logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
|
||||
wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path,
|
||||
training_entities_path, descr_from_wp, limit_train)
|
||||
if descr_from_wp:
|
||||
logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))
|
||||
else:
|
||||
logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
|
||||
if descr_from_wp:
|
||||
logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))
|
||||
|
||||
# STEP 6: creating the actual KB
|
||||
# It takes ca. 30 minutes to pretrain the entity embeddings
|
||||
logger.info("STEP 6: creating the KB at {}".format(kb_path))
|
||||
if not kb_path.exists():
|
||||
logger.info("STEP 6: Creating the KB at {}".format(kb_path))
|
||||
kb = kb_creator.create_kb(
|
||||
nlp=nlp,
|
||||
max_entities_per_alias=max_per_alias,
|
||||
min_entity_freq=min_freq,
|
||||
min_occ=min_pair,
|
||||
entity_def_input=entity_defs_path,
|
||||
entity_def_path=entity_defs_path,
|
||||
entity_descr_path=entity_descr_path,
|
||||
count_input=entity_freq_path,
|
||||
prior_prob_input=prior_prob_path,
|
||||
entity_alias_path=entity_alias_path,
|
||||
entity_freq_path=entity_freq_path,
|
||||
prior_prob_path=prior_prob_path,
|
||||
entity_vector_length=entity_vector_length,
|
||||
)
|
||||
|
||||
kb.dump(kb_path)
|
||||
logger.info("kb entities: {}".format(kb.get_size_entities()))
|
||||
logger.info("kb aliases: {}".format(kb.get_size_aliases()))
|
||||
nlp.to_disk(output_dir / KB_MODEL_DIR)
|
||||
else:
|
||||
logger.info("STEP 6: KB already exists at {}".format(kb_path))
|
||||
|
||||
logger.info("Done!")
|
||||
|
||||
|
|
|
@ -1,40 +1,52 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import gzip
|
||||
import bz2
|
||||
import json
|
||||
import logging
|
||||
import datetime
|
||||
|
||||
from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descriptions=True):
|
||||
# Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines.
|
||||
def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True):
|
||||
# Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines.
|
||||
# get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
|
||||
|
||||
site_filter = '{}wiki'.format(lang)
|
||||
|
||||
# properties filter (currently disabled to get ALL data)
|
||||
prop_filter = dict()
|
||||
# prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
|
||||
# filter: currently defined as OR: one hit suffices to be removed from further processing
|
||||
exclude_list = WD_META_ITEMS
|
||||
|
||||
# punctuation
|
||||
exclude_list.extend(["Q1383557", "Q10617810"])
|
||||
|
||||
# letters etc
|
||||
exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"])
|
||||
|
||||
neg_prop_filter = {
|
||||
'P31': exclude_list, # instance of
|
||||
'P279': exclude_list # subclass
|
||||
}
|
||||
|
||||
title_to_id = dict()
|
||||
id_to_descr = dict()
|
||||
id_to_alias = dict()
|
||||
|
||||
# parse appropriate fields - depending on what we need in the KB
|
||||
parse_properties = False
|
||||
parse_sitelinks = True
|
||||
parse_labels = False
|
||||
parse_aliases = False
|
||||
parse_claims = False
|
||||
parse_aliases = True
|
||||
parse_claims = True
|
||||
|
||||
with gzip.open(wikidata_file, mode='rb') as file:
|
||||
with bz2.open(wikidata_file, mode='rb') as file:
|
||||
for cnt, line in enumerate(file):
|
||||
if limit and cnt >= limit:
|
||||
break
|
||||
if cnt % 500000 == 0:
|
||||
logger.info("processed {} lines of WikiData dump".format(cnt))
|
||||
if cnt % 500000 == 0 and cnt > 0:
|
||||
logger.info("processed {} lines of WikiData JSON dump".format(cnt))
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
clean_line = clean_line[:-1]
|
||||
|
@ -43,13 +55,11 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
|
|||
entry_type = obj["type"]
|
||||
|
||||
if entry_type == "item":
|
||||
# filtering records on their properties (currently disabled to get ALL data)
|
||||
# keep = False
|
||||
keep = True
|
||||
|
||||
claims = obj["claims"]
|
||||
if parse_claims:
|
||||
for prop, value_set in prop_filter.items():
|
||||
for prop, value_set in neg_prop_filter.items():
|
||||
claim_property = claims.get(prop, None)
|
||||
if claim_property:
|
||||
for cp in claim_property:
|
||||
|
@ -61,7 +71,7 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
|
|||
)
|
||||
cp_rank = cp["rank"]
|
||||
if cp_rank != "deprecated" and cp_id in value_set:
|
||||
keep = True
|
||||
keep = False
|
||||
|
||||
if keep:
|
||||
unique_id = obj["id"]
|
||||
|
@ -108,7 +118,7 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
|
|||
"label (" + lang + "):", lang_label["value"]
|
||||
)
|
||||
|
||||
if found_link and parse_descriptions:
|
||||
if found_link and parse_descr:
|
||||
descriptions = obj["descriptions"]
|
||||
if descriptions:
|
||||
lang_descr = descriptions.get(lang, None)
|
||||
|
@ -130,22 +140,15 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
|
|||
print(
|
||||
"alias (" + lang + "):", item["value"]
|
||||
)
|
||||
alias_list = id_to_alias.get(unique_id, [])
|
||||
alias_list.append(item["value"])
|
||||
id_to_alias[unique_id] = alias_list
|
||||
|
||||
if to_print:
|
||||
print()
|
||||
|
||||
return title_to_id, id_to_descr
|
||||
# log final number of lines processed
|
||||
logger.info("Finished. Processed {} lines of WikiData JSON dump".format(cnt))
|
||||
return title_to_id, id_to_descr, id_to_alias
|
||||
|
||||
|
||||
def write_entity_files(entity_def_output, title_to_id):
|
||||
with entity_def_output.open("w", encoding="utf8") as id_file:
|
||||
id_file.write("WP_title" + "|" + "WD_id" + "\n")
|
||||
for title, qid in title_to_id.items():
|
||||
id_file.write(title + "|" + str(qid) + "\n")
|
||||
|
||||
|
||||
def write_entity_description_files(entity_descr_output, id_to_descr):
|
||||
with entity_descr_output.open("w", encoding="utf8") as descr_file:
|
||||
descr_file.write("WD_id" + "|" + "description" + "\n")
|
||||
for qid, descr in id_to_descr.items():
|
||||
descr_file.write(str(qid) + "|" + descr + "\n")
|
||||
|
|
|
@ -6,19 +6,19 @@ as created by the script `wikidata_create_kb`.
|
|||
|
||||
For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
|
||||
from https://dumps.wikimedia.org/enwiki/latest/
|
||||
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import random
|
||||
import logging
|
||||
import spacy
|
||||
from pathlib import Path
|
||||
import plac
|
||||
|
||||
from bin.wiki_entity_linking import training_set_creator
|
||||
from bin.wiki_entity_linking import wikipedia_processor
|
||||
from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR
|
||||
from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance, measure_baselines
|
||||
from bin.wiki_entity_linking.kb_creator import read_nlp_kb
|
||||
from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
|
||||
from bin.wiki_entity_linking.kb_creator import read_kb
|
||||
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
@ -35,6 +35,7 @@ logger = logging.getLogger(__name__)
|
|||
l2=("L2 regularization", "option", "r", float),
|
||||
train_inst=("# training instances (default 90% of all)", "option", "t", int),
|
||||
dev_inst=("# test instances (default 10% of all)", "option", "d", int),
|
||||
labels_discard=("NER labels to discard (default None)", "option", "l", str),
|
||||
)
|
||||
def main(
|
||||
dir_kb,
|
||||
|
@ -46,13 +47,14 @@ def main(
|
|||
l2=1e-6,
|
||||
train_inst=None,
|
||||
dev_inst=None,
|
||||
labels_discard=None
|
||||
):
|
||||
logger.info("Creating Entity Linker with Wikipedia and WikiData")
|
||||
|
||||
output_dir = Path(output_dir) if output_dir else dir_kb
|
||||
training_path = loc_training if loc_training else output_dir / TRAINING_DATA_FILE
|
||||
training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
|
||||
nlp_dir = dir_kb / KB_MODEL_DIR
|
||||
kb_path = output_dir / KB_FILE
|
||||
kb_path = dir_kb / KB_FILE
|
||||
nlp_output_dir = output_dir / OUTPUT_MODEL_DIR
|
||||
|
||||
# STEP 0: set up IO
|
||||
|
@ -60,38 +62,47 @@ def main(
|
|||
output_dir.mkdir()
|
||||
|
||||
# STEP 1 : load the NLP object
|
||||
logger.info("STEP 1: loading model from {}".format(nlp_dir))
|
||||
nlp, kb = read_nlp_kb(nlp_dir, kb_path)
|
||||
logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
|
||||
nlp = spacy.load(nlp_dir)
|
||||
logger.info("STEP 1b: Loading KB from {}".format(kb_path))
|
||||
kb = read_kb(nlp, kb_path)
|
||||
|
||||
# check that there is a NER component in the pipeline
|
||||
if "ner" not in nlp.pipe_names:
|
||||
raise ValueError("The `nlp` object should have a pretrained `ner` component.")
|
||||
|
||||
# STEP 2: create a training dataset from WP
|
||||
logger.info("STEP 2: reading training dataset from {}".format(training_path))
|
||||
# STEP 2: read the training dataset previously created from WP
|
||||
logger.info("STEP 2: Reading training dataset from {}".format(training_path))
|
||||
|
||||
train_data = training_set_creator.read_training(
|
||||
if labels_discard:
|
||||
labels_discard = [x.strip() for x in labels_discard.split(",")]
|
||||
logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))
|
||||
|
||||
train_data = wikipedia_processor.read_training(
|
||||
nlp=nlp,
|
||||
entity_file_path=training_path,
|
||||
dev=False,
|
||||
limit=train_inst,
|
||||
kb=kb,
|
||||
labels_discard=labels_discard
|
||||
)
|
||||
|
||||
# for testing, get all pos instances, whether or not they are in the kb
|
||||
dev_data = training_set_creator.read_training(
|
||||
# for testing, get all pos instances (independently of KB)
|
||||
dev_data = wikipedia_processor.read_training(
|
||||
nlp=nlp,
|
||||
entity_file_path=training_path,
|
||||
dev=True,
|
||||
limit=dev_inst,
|
||||
kb=kb,
|
||||
kb=None,
|
||||
labels_discard=labels_discard
|
||||
)
|
||||
|
||||
# STEP 3: create and train the entity linking pipe
|
||||
logger.info("STEP 3: training Entity Linking pipe")
|
||||
# STEP 3: create and train an entity linking pipe
|
||||
logger.info("STEP 3: Creating and training an Entity Linking pipe")
|
||||
|
||||
el_pipe = nlp.create_pipe(
|
||||
name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name}
|
||||
name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
|
||||
"labels_discard": labels_discard}
|
||||
)
|
||||
el_pipe.set_kb(kb)
|
||||
nlp.add_pipe(el_pipe, last=True)
|
||||
|
@ -105,14 +116,9 @@ def main(
|
|||
logger.info("Training on {} articles".format(len(train_data)))
|
||||
logger.info("Dev testing on {} articles".format(len(dev_data)))
|
||||
|
||||
dev_baseline_accuracies = measure_baselines(
|
||||
dev_data, kb
|
||||
)
|
||||
|
||||
# baseline performance on dev data
|
||||
logger.info("Dev Baseline Accuracies:")
|
||||
logger.info(dev_baseline_accuracies.report_accuracy("random"))
|
||||
logger.info(dev_baseline_accuracies.report_accuracy("prior"))
|
||||
logger.info(dev_baseline_accuracies.report_accuracy("oracle"))
|
||||
measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)
|
||||
|
||||
for itn in range(epochs):
|
||||
random.shuffle(train_data)
|
||||
|
@ -136,18 +142,18 @@ def main(
|
|||
logger.error("Error updating batch:" + str(e))
|
||||
if batchnr > 0:
|
||||
logging.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
|
||||
measure_performance(dev_data, kb, el_pipe)
|
||||
measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)
|
||||
|
||||
# STEP 4: measure the performance of our trained pipe on an independent dev set
|
||||
logger.info("STEP 4: performance measurement of Entity Linking pipe")
|
||||
logger.info("STEP 4: Final performance measurement of Entity Linking pipe")
|
||||
measure_performance(dev_data, kb, el_pipe)
|
||||
|
||||
# STEP 5: apply the EL pipe on a toy example
|
||||
logger.info("STEP 5: applying Entity Linking to toy example")
|
||||
logger.info("STEP 5: Applying Entity Linking to toy example")
|
||||
run_el_toy_example(nlp=nlp)
|
||||
|
||||
if output_dir:
|
||||
# STEP 6: write the NLP pipeline (including entity linker) to file
|
||||
# STEP 6: write the NLP pipeline (now including an EL model) to file
|
||||
logger.info("STEP 6: Writing trained NLP to {}".format(nlp_output_dir))
|
||||
nlp.to_disk(nlp_output_dir)
|
||||
|
||||
|
|
|
@ -3,147 +3,104 @@ from __future__ import unicode_literals
|
|||
|
||||
import re
|
||||
import bz2
|
||||
import csv
|
||||
import datetime
|
||||
import logging
|
||||
import random
|
||||
import json
|
||||
|
||||
from bin.wiki_entity_linking import LOG_FORMAT
|
||||
from functools import partial
|
||||
|
||||
from spacy.gold import GoldParse
|
||||
from bin.wiki_entity_linking import wiki_io as io
|
||||
from bin.wiki_entity_linking.wiki_namespaces import (
|
||||
WP_META_NAMESPACE,
|
||||
WP_FILE_NAMESPACE,
|
||||
WP_CATEGORY_NAMESPACE,
|
||||
)
|
||||
|
||||
"""
|
||||
Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
|
||||
Write these results to file for downstream KB and training data generation.
|
||||
|
||||
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
|
||||
"""
|
||||
|
||||
ENTITY_FILE = "gold_entities.csv"
|
||||
|
||||
map_alias_to_link = dict()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# these will/should be matched ignoring case
|
||||
wiki_namespaces = [
|
||||
"b",
|
||||
"betawikiversity",
|
||||
"Book",
|
||||
"c",
|
||||
"Category",
|
||||
"Commons",
|
||||
"d",
|
||||
"dbdump",
|
||||
"download",
|
||||
"Draft",
|
||||
"Education",
|
||||
"Foundation",
|
||||
"Gadget",
|
||||
"Gadget definition",
|
||||
"gerrit",
|
||||
"File",
|
||||
"Help",
|
||||
"Image",
|
||||
"Incubator",
|
||||
"m",
|
||||
"mail",
|
||||
"mailarchive",
|
||||
"media",
|
||||
"MediaWiki",
|
||||
"MediaWiki talk",
|
||||
"Mediawikiwiki",
|
||||
"MediaZilla",
|
||||
"Meta",
|
||||
"Metawikipedia",
|
||||
"Module",
|
||||
"mw",
|
||||
"n",
|
||||
"nost",
|
||||
"oldwikisource",
|
||||
"outreach",
|
||||
"outreachwiki",
|
||||
"otrs",
|
||||
"OTRSwiki",
|
||||
"Portal",
|
||||
"phab",
|
||||
"Phabricator",
|
||||
"Project",
|
||||
"q",
|
||||
"quality",
|
||||
"rev",
|
||||
"s",
|
||||
"spcom",
|
||||
"Special",
|
||||
"species",
|
||||
"Strategy",
|
||||
"sulutil",
|
||||
"svn",
|
||||
"Talk",
|
||||
"Template",
|
||||
"Template talk",
|
||||
"Testwiki",
|
||||
"ticket",
|
||||
"TimedText",
|
||||
"Toollabs",
|
||||
"tools",
|
||||
"tswiki",
|
||||
"User",
|
||||
"User talk",
|
||||
"v",
|
||||
"voy",
|
||||
"w",
|
||||
"Wikibooks",
|
||||
"Wikidata",
|
||||
"wikiHow",
|
||||
"Wikinvest",
|
||||
"wikilivres",
|
||||
"Wikimedia",
|
||||
"Wikinews",
|
||||
"Wikipedia",
|
||||
"Wikipedia talk",
|
||||
"Wikiquote",
|
||||
"Wikisource",
|
||||
"Wikispecies",
|
||||
"Wikitech",
|
||||
"Wikiversity",
|
||||
"Wikivoyage",
|
||||
"wikt",
|
||||
"wiktionary",
|
||||
"wmf",
|
||||
"wmania",
|
||||
"WP",
|
||||
]
|
||||
title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
|
||||
id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
|
||||
text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
|
||||
info_regex = re.compile(r"{[^{]*?}")
|
||||
html_regex = re.compile(r"<!--[^-]*-->")
|
||||
ref_regex = re.compile(r"<ref.*?>") # non-greedy
|
||||
ref_2_regex = re.compile(r"</ref.*?>") # non-greedy
|
||||
|
||||
# find the links
|
||||
link_regex = re.compile(r"\[\[[^\[\]]*\]\]")
|
||||
|
||||
# match on interwiki links, e.g. `en:` or `:fr:`
|
||||
ns_regex = r":?" + "[a-z][a-z]" + ":"
|
||||
|
||||
# match on Namespace: optionally preceded by a :
|
||||
for ns in wiki_namespaces:
|
||||
for ns in WP_META_NAMESPACE:
|
||||
ns_regex += "|" + ":?" + ns + ":"
|
||||
|
||||
ns_regex = re.compile(ns_regex, re.IGNORECASE)
|
||||
|
||||
files = r""
|
||||
for f in WP_FILE_NAMESPACE:
|
||||
files += "\[\[" + f + ":[^[\]]+]]" + "|"
|
||||
files = files[0 : len(files) - 1]
|
||||
file_regex = re.compile(files)
|
||||
|
||||
cats = r""
|
||||
for c in WP_CATEGORY_NAMESPACE:
|
||||
cats += "\[\[" + c + ":[^\[]*]]" + "|"
|
||||
cats = cats[0 : len(cats) - 1]
|
||||
category_regex = re.compile(cats)
|
||||
|
||||
|
||||
def read_prior_probs(wikipedia_input, prior_prob_output, limit=None):
|
||||
"""
|
||||
Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities.
|
||||
The full file takes about 2h to parse 1100M lines.
|
||||
It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from.
|
||||
The full file takes about 2-3h to parse 1100M lines.
|
||||
It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from,
|
||||
though dev test articles are excluded in order not to get an artificially strong baseline.
|
||||
"""
|
||||
cnt = 0
|
||||
read_id = False
|
||||
current_article_id = None
|
||||
with bz2.open(wikipedia_input, mode="rb") as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 25000000 == 0:
|
||||
if cnt % 25000000 == 0 and cnt > 0:
|
||||
logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
|
||||
# we attempt at reading the article's ID (but not the revision or contributor ID)
|
||||
if "<revision>" in clean_line or "<contributor>" in clean_line:
|
||||
read_id = False
|
||||
if "<page>" in clean_line:
|
||||
read_id = True
|
||||
|
||||
if read_id:
|
||||
ids = id_regex.search(clean_line)
|
||||
if ids:
|
||||
current_article_id = ids[0]
|
||||
|
||||
# only processing prior probabilities from true training (non-dev) articles
|
||||
if not is_dev(current_article_id):
|
||||
aliases, entities, normalizations = get_wp_links(clean_line)
|
||||
for alias, entity, norm in zip(aliases, entities, normalizations):
|
||||
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||
_store_alias(
|
||||
alias, entity, normalize_alias=norm, normalize_entity=True
|
||||
)
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
|
||||
logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
|
||||
|
||||
# write all aliases and their entities and count occurrences to file
|
||||
with prior_prob_output.open("w", encoding="utf8") as outputfile:
|
||||
|
@ -182,7 +139,7 @@ def get_wp_links(text):
|
|||
match = match[2:][:-2].replace("_", " ").strip()
|
||||
|
||||
if ns_regex.match(match):
|
||||
pass # ignore namespaces at the beginning of the string
|
||||
pass # ignore the entity if it points to a "meta" page
|
||||
|
||||
# this is a simple [[link]], with the alias the same as the mention
|
||||
elif "|" not in match:
|
||||
|
@ -218,47 +175,382 @@ def _capitalize_first(text):
|
|||
return result
|
||||
|
||||
|
||||
def write_entity_counts(prior_prob_input, count_output, to_print=False):
|
||||
# Write entity counts for quick access later
|
||||
entity_to_count = dict()
|
||||
total_count = 0
|
||||
|
||||
with prior_prob_input.open("r", encoding="utf8") as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
|
||||
while line:
|
||||
splits = line.replace("\n", "").split(sep="|")
|
||||
# alias = splits[0]
|
||||
count = int(splits[1])
|
||||
entity = splits[2]
|
||||
|
||||
current_count = entity_to_count.get(entity, 0)
|
||||
entity_to_count[entity] = current_count + count
|
||||
|
||||
total_count += count
|
||||
|
||||
line = prior_file.readline()
|
||||
|
||||
with count_output.open("w", encoding="utf8") as entity_file:
|
||||
entity_file.write("entity" + "|" + "count" + "\n")
|
||||
for entity, count in entity_to_count.items():
|
||||
entity_file.write(entity + "|" + str(count) + "\n")
|
||||
|
||||
if to_print:
|
||||
for entity, count in entity_to_count.items():
|
||||
print("Entity count:", entity, count)
|
||||
print("Total count:", total_count)
|
||||
def create_training_and_desc(
|
||||
wp_input, def_input, desc_output, training_output, parse_desc, limit=None
|
||||
):
|
||||
wp_to_id = io.read_title_to_id(def_input)
|
||||
_process_wikipedia_texts(
|
||||
wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
|
||||
)
|
||||
|
||||
|
||||
def get_all_frequencies(count_input):
|
||||
entity_to_count = dict()
|
||||
with count_input.open("r", encoding="utf8") as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter="|")
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_count[row[0]] = int(row[1])
|
||||
def _process_wikipedia_texts(
|
||||
wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
|
||||
):
|
||||
"""
|
||||
Read the XML wikipedia data to parse out training data:
|
||||
raw text data + positive instances
|
||||
"""
|
||||
|
||||
return entity_to_count
|
||||
read_ids = set()
|
||||
|
||||
with output.open("a", encoding="utf8") as descr_file, training_output.open(
|
||||
"w", encoding="utf8"
|
||||
) as entity_file:
|
||||
if parse_descriptions:
|
||||
_write_training_description(descr_file, "WD_id", "description")
|
||||
with bz2.open(wikipedia_input, mode="rb") as file:
|
||||
article_count = 0
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
|
||||
for line in file:
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
|
||||
if clean_line == "<revision>":
|
||||
reading_revision = True
|
||||
elif clean_line == "</revision>":
|
||||
reading_revision = False
|
||||
|
||||
# Start reading new page
|
||||
if clean_line == "<page>":
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
# finished reading this page
|
||||
elif clean_line == "</page>":
|
||||
if article_id:
|
||||
clean_text, entities = _process_wp_text(
|
||||
article_title, article_text, wp_to_id
|
||||
)
|
||||
if clean_text is not None and entities is not None:
|
||||
_write_training_entities(
|
||||
entity_file, article_id, clean_text, entities
|
||||
)
|
||||
|
||||
if article_title in wp_to_id and parse_descriptions:
|
||||
description = " ".join(
|
||||
clean_text[:1000].split(" ")[:-1]
|
||||
)
|
||||
_write_training_description(
|
||||
descr_file, wp_to_id[article_title], description
|
||||
)
|
||||
article_count += 1
|
||||
if article_count % 10000 == 0 and article_count > 0:
|
||||
logger.info(
|
||||
"Processed {} articles".format(article_count)
|
||||
)
|
||||
if limit and article_count >= limit:
|
||||
break
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
|
||||
# start reading text within a page
|
||||
if "<text" in clean_line:
|
||||
reading_text = True
|
||||
|
||||
if reading_text:
|
||||
article_text += " " + clean_line
|
||||
|
||||
# stop reading text within a page (we assume a new page doesn't start on the same line)
|
||||
if "</text" in clean_line:
|
||||
reading_text = False
|
||||
|
||||
# read the ID of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
ids = id_regex.search(clean_line)
|
||||
if ids:
|
||||
article_id = ids[0]
|
||||
if article_id in read_ids:
|
||||
logger.info(
|
||||
"Found duplicate article ID", article_id, clean_line
|
||||
) # This should never happen ...
|
||||
read_ids.add(article_id)
|
||||
|
||||
# read the title of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
titles = title_regex.search(clean_line)
|
||||
if titles:
|
||||
article_title = titles[0].strip()
|
||||
logger.info("Finished. Processed {} articles".format(article_count))
|
||||
|
||||
|
||||
def _process_wp_text(article_title, article_text, wp_to_id):
|
||||
# ignore meta Wikipedia pages
|
||||
if ns_regex.match(article_title):
|
||||
return None, None
|
||||
|
||||
# remove the text tags
|
||||
text_search = text_regex.search(article_text)
|
||||
if text_search is None:
|
||||
return None, None
|
||||
text = text_search.group(0)
|
||||
|
||||
# stop processing if this is a redirect page
|
||||
if text.startswith("#REDIRECT"):
|
||||
return None, None
|
||||
|
||||
# get the raw text without markup etc, keeping only interwiki links
|
||||
clean_text, entities = _remove_links(_get_clean_wp_text(text), wp_to_id)
|
||||
return clean_text, entities
|
||||
|
||||
|
||||
def _get_clean_wp_text(article_text):
|
||||
clean_text = article_text.strip()
|
||||
|
||||
# remove bolding & italic markup
|
||||
clean_text = clean_text.replace("'''", "")
|
||||
clean_text = clean_text.replace("''", "")
|
||||
|
||||
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
|
||||
try_again = True
|
||||
previous_length = len(clean_text)
|
||||
while try_again:
|
||||
clean_text = info_regex.sub(
|
||||
"", clean_text
|
||||
) # non-greedy match excluding a nested {
|
||||
if len(clean_text) < previous_length:
|
||||
try_again = True
|
||||
else:
|
||||
try_again = False
|
||||
previous_length = len(clean_text)
|
||||
|
||||
# remove HTML comments
|
||||
clean_text = html_regex.sub("", clean_text)
|
||||
|
||||
# remove Category and File statements
|
||||
clean_text = category_regex.sub("", clean_text)
|
||||
clean_text = file_regex.sub("", clean_text)
|
||||
|
||||
# remove multiple =
|
||||
while "==" in clean_text:
|
||||
clean_text = clean_text.replace("==", "=")
|
||||
|
||||
clean_text = clean_text.replace(". =", ".")
|
||||
clean_text = clean_text.replace(" = ", ". ")
|
||||
clean_text = clean_text.replace("= ", ".")
|
||||
clean_text = clean_text.replace(" =", "")
|
||||
|
||||
# remove refs (non-greedy match)
|
||||
clean_text = ref_regex.sub("", clean_text)
|
||||
clean_text = ref_2_regex.sub("", clean_text)
|
||||
|
||||
# remove additional wikiformatting
|
||||
clean_text = re.sub(r"<blockquote>", "", clean_text)
|
||||
clean_text = re.sub(r"</blockquote>", "", clean_text)
|
||||
|
||||
# change special characters back to normal ones
|
||||
clean_text = clean_text.replace(r"<", "<")
|
||||
clean_text = clean_text.replace(r">", ">")
|
||||
clean_text = clean_text.replace(r""", '"')
|
||||
clean_text = clean_text.replace(r"&nbsp;", " ")
|
||||
clean_text = clean_text.replace(r"&", "&")
|
||||
|
||||
# remove multiple spaces
|
||||
while " " in clean_text:
|
||||
clean_text = clean_text.replace(" ", " ")
|
||||
|
||||
return clean_text.strip()
|
||||
|
||||
|
||||
def _remove_links(clean_text, wp_to_id):
|
||||
# read the text char by char to get the right offsets for the interwiki links
|
||||
entities = []
|
||||
final_text = ""
|
||||
open_read = 0
|
||||
reading_text = True
|
||||
reading_entity = False
|
||||
reading_mention = False
|
||||
reading_special_case = False
|
||||
entity_buffer = ""
|
||||
mention_buffer = ""
|
||||
for index, letter in enumerate(clean_text):
|
||||
if letter == "[":
|
||||
open_read += 1
|
||||
elif letter == "]":
|
||||
open_read -= 1
|
||||
elif letter == "|":
|
||||
if reading_text:
|
||||
final_text += letter
|
||||
# switch from reading entity to mention in the [[entity|mention]] pattern
|
||||
elif reading_entity:
|
||||
reading_text = False
|
||||
reading_entity = False
|
||||
reading_mention = True
|
||||
else:
|
||||
reading_special_case = True
|
||||
else:
|
||||
if reading_entity:
|
||||
entity_buffer += letter
|
||||
elif reading_mention:
|
||||
mention_buffer += letter
|
||||
elif reading_text:
|
||||
final_text += letter
|
||||
else:
|
||||
raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
|
||||
|
||||
if open_read > 2:
|
||||
reading_special_case = True
|
||||
|
||||
if open_read == 2 and reading_text:
|
||||
reading_text = False
|
||||
reading_entity = True
|
||||
reading_mention = False
|
||||
|
||||
# we just finished reading an entity
|
||||
if open_read == 0 and not reading_text:
|
||||
if "#" in entity_buffer or entity_buffer.startswith(":"):
|
||||
reading_special_case = True
|
||||
# Ignore cases with nested structures like File: handles etc
|
||||
if not reading_special_case:
|
||||
if not mention_buffer:
|
||||
mention_buffer = entity_buffer
|
||||
start = len(final_text)
|
||||
end = start + len(mention_buffer)
|
||||
qid = wp_to_id.get(entity_buffer, None)
|
||||
if qid:
|
||||
entities.append((mention_buffer, qid, start, end))
|
||||
final_text += mention_buffer
|
||||
|
||||
entity_buffer = ""
|
||||
mention_buffer = ""
|
||||
|
||||
reading_text = True
|
||||
reading_entity = False
|
||||
reading_mention = False
|
||||
reading_special_case = False
|
||||
return final_text, entities
|
||||
|
||||
|
||||
def _write_training_description(outputfile, qid, description):
|
||||
if description is not None:
|
||||
line = str(qid) + "|" + description + "\n"
|
||||
outputfile.write(line)
|
||||
|
||||
|
||||
def _write_training_entities(outputfile, article_id, clean_text, entities):
|
||||
entities_data = [
|
||||
{"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
|
||||
for ent in entities
|
||||
]
|
||||
line = (
|
||||
json.dumps(
|
||||
{
|
||||
"article_id": article_id,
|
||||
"clean_text": clean_text,
|
||||
"entities": entities_data,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
outputfile.write(line)
|
||||
|
||||
|
||||
def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None):
|
||||
""" This method provides training examples that correspond to the entity annotations found by the nlp object.
|
||||
For training, it will include both positive and negative examples by using the candidate generator from the kb.
|
||||
For testing (kb=None), it will include all positive examples only."""
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
if not labels_discard:
|
||||
labels_discard = []
|
||||
|
||||
data = []
|
||||
num_entities = 0
|
||||
get_gold_parse = partial(
|
||||
_get_gold_parse, dev=dev, kb=kb, labels_discard=labels_discard
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Reading {} data with limit {}".format("dev" if dev else "train", limit)
|
||||
)
|
||||
with entity_file_path.open("r", encoding="utf8") as file:
|
||||
with tqdm(total=limit, leave=False) as pbar:
|
||||
for i, line in enumerate(file):
|
||||
example = json.loads(line)
|
||||
article_id = example["article_id"]
|
||||
clean_text = example["clean_text"]
|
||||
entities = example["entities"]
|
||||
|
||||
if dev != is_dev(article_id) or not is_valid_article(clean_text):
|
||||
continue
|
||||
|
||||
doc = nlp(clean_text)
|
||||
gold = get_gold_parse(doc, entities)
|
||||
if gold and len(gold.links) > 0:
|
||||
data.append((doc, gold))
|
||||
num_entities += len(gold.links)
|
||||
pbar.update(len(gold.links))
|
||||
if limit and num_entities >= limit:
|
||||
break
|
||||
logger.info("Read {} entities in {} articles".format(num_entities, len(data)))
|
||||
return data
|
||||
|
||||
|
||||
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
|
||||
gold_entities = {}
|
||||
tagged_ent_positions = {
|
||||
(ent.start_char, ent.end_char): ent
|
||||
for ent in doc.ents
|
||||
if ent.label_ not in labels_discard
|
||||
}
|
||||
|
||||
for entity in entities:
|
||||
entity_id = entity["entity"]
|
||||
alias = entity["alias"]
|
||||
start = entity["start"]
|
||||
end = entity["end"]
|
||||
|
||||
candidate_ids = []
|
||||
if kb and not dev:
|
||||
candidates = kb.get_candidates(alias)
|
||||
candidate_ids = [cand.entity_ for cand in candidates]
|
||||
|
||||
tagged_ent = tagged_ent_positions.get((start, end), None)
|
||||
if tagged_ent:
|
||||
# TODO: check that alias == doc.text[start:end]
|
||||
should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence(
|
||||
tagged_ent.sent.text
|
||||
)
|
||||
|
||||
if should_add_ent:
|
||||
value_by_id = {entity_id: 1.0}
|
||||
if not dev:
|
||||
random.shuffle(candidate_ids)
|
||||
value_by_id.update(
|
||||
{kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id}
|
||||
)
|
||||
gold_entities[(start, end)] = value_by_id
|
||||
|
||||
return GoldParse(doc, links=gold_entities)
|
||||
|
||||
|
||||
def is_dev(article_id):
|
||||
if not article_id:
|
||||
return False
|
||||
return article_id.endswith("3")
|
||||
|
||||
|
||||
def is_valid_article(doc_text):
|
||||
# custom length cut-off
|
||||
return 10 < len(doc_text) < 30000
|
||||
|
||||
|
||||
def is_valid_sentence(sent_text):
|
||||
if not 10 < len(sent_text) < 3000:
|
||||
# custom length cut-off
|
||||
return False
|
||||
|
||||
if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"):
|
||||
# remove 'enumeration' sentences (occurs often on Wikipedia)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
|
|
@ -246,7 +246,7 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
|
|||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
drop (float): The droput rate.
|
||||
drop (float): The dropout rate.
|
||||
optimizer (callable): An optimizer.
|
||||
RETURNS loss: A float for the loss.
|
||||
"""
|
||||
|
|
|
@ -80,8 +80,8 @@ class Warnings(object):
|
|||
"the v2.x models cannot release the global interpreter lock. "
|
||||
"Future versions may introduce a `n_process` argument for "
|
||||
"parallel inference via multiprocessing.")
|
||||
W017 = ("Alias '{alias}' already exists in the Knowledge base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge base.")
|
||||
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
|
||||
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
|
||||
"previously loaded vectors. See Issue #3853.")
|
||||
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
|
||||
|
@ -96,6 +96,8 @@ class Warnings(object):
|
|||
"If this is surprising, make sure you have the spacy-lookups-data "
|
||||
"package installed.")
|
||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python2. 'n_process' will be set to 1.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
"the Knowledge Base.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -408,7 +410,7 @@ class Errors(object):
|
|||
"{probabilities_length} respectively.")
|
||||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
|
||||
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
|
||||
E136 = ("This additional feature requires the jsonschema library to be "
|
||||
|
@ -420,7 +422,7 @@ class Errors(object):
|
|||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||
"includes either the `text` or `tokens` key. For more info, see "
|
||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
||||
E139 = ("Knowledge base for component '{name}' not initialized. Did you "
|
||||
E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
|
||||
"forget to call set_kb()?")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
|
@ -499,6 +501,7 @@ class Errors(object):
|
|||
E174 = ("Architecture '{name}' not found in registry. Available "
|
||||
"names: {names}")
|
||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
69
spacy/kb.pyx
69
spacy/kb.pyx
|
@ -142,6 +142,7 @@ cdef class KnowledgeBase:
|
|||
|
||||
i = 0
|
||||
cdef KBEntryC entry
|
||||
cdef hash_t entity_hash
|
||||
while i < nr_entities:
|
||||
entity_vector = vector_list[i]
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
|
@ -161,6 +162,14 @@ cdef class KnowledgeBase:
|
|||
|
||||
i += 1
|
||||
|
||||
def contains_entity(self, unicode entity):
|
||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||
return entity_hash in self._entry_index
|
||||
|
||||
def contains_alias(self, unicode alias):
|
||||
cdef hash_t alias_hash = self.vocab.strings.add(alias)
|
||||
return alias_hash in self._alias_index
|
||||
|
||||
def add_alias(self, unicode alias, entities, probabilities):
|
||||
"""
|
||||
For a given alias, add its potential entities and prior probabilies to the KB.
|
||||
|
@ -190,7 +199,7 @@ cdef class KnowledgeBase:
|
|||
for entity, prob in zip(entities, probabilities):
|
||||
entity_hash = self.vocab.strings[entity]
|
||||
if not entity_hash in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(alias=alias, entity=entity))
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
entry_indices.push_back(int(entry_index))
|
||||
|
@ -201,8 +210,63 @@ cdef class KnowledgeBase:
|
|||
|
||||
return alias_hash
|
||||
|
||||
def get_candidates(self, unicode alias):
|
||||
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
|
||||
"""
|
||||
For an alias already existing in the KB, extend its potential entities with one more.
|
||||
Throw a warning if either the alias or the entity is unknown,
|
||||
or when the combination is already previously recorded.
|
||||
Throw an error if this entity+prior prob would exceed the sum of 1.
|
||||
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
|
||||
"""
|
||||
# Check if the alias exists in the KB
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if not alias_hash in self._alias_index:
|
||||
raise ValueError(Errors.E176.format(alias=alias))
|
||||
|
||||
# Check if the entity exists in the KB
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
if not entity_hash in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
|
||||
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
current_sum = sum([p for p in alias_entry.probs])
|
||||
new_sum = current_sum + prior_prob
|
||||
|
||||
if new_sum > 1.00001:
|
||||
raise ValueError(Errors.E133.format(alias=alias, sum=new_sum))
|
||||
|
||||
entry_indices = alias_entry.entry_indices
|
||||
|
||||
is_present = False
|
||||
for i in range(entry_indices.size()):
|
||||
if entry_indices[i] == int(entry_index):
|
||||
is_present = True
|
||||
|
||||
if is_present:
|
||||
if not ignore_warnings:
|
||||
user_warning(Warnings.W024.format(entity=entity, alias=alias))
|
||||
else:
|
||||
entry_indices.push_back(int(entry_index))
|
||||
alias_entry.entry_indices = entry_indices
|
||||
|
||||
probs = alias_entry.probs
|
||||
probs.push_back(float(prior_prob))
|
||||
alias_entry.probs = probs
|
||||
self._aliases_table[alias_index] = alias_entry
|
||||
|
||||
|
||||
def get_candidates(self, unicode alias):
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If the alias is not known in the KB, and empty list is returned.
|
||||
"""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if not alias_hash in self._alias_index:
|
||||
return []
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
|
||||
|
@ -341,7 +405,6 @@ cdef class KnowledgeBase:
|
|||
assert nr_entities == self.get_size_entities()
|
||||
|
||||
# STEP 3: load aliases
|
||||
|
||||
cdef int64_t nr_aliases
|
||||
reader.read_alias_length(&nr_aliases)
|
||||
self._alias_index = PreshMap(nr_aliases+1)
|
||||
|
|
|
@ -483,7 +483,7 @@ class Language(object):
|
|||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
drop (float): The dropout rate.
|
||||
sgd (callable): An optimizer.
|
||||
losses (dict): Dictionary to update with the loss, keyed by component.
|
||||
component_cfg (dict): Config parameters for specific pipeline
|
||||
|
@ -531,7 +531,7 @@ class Language(object):
|
|||
even if you're updating it with a smaller set of examples.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
drop (float): The droput rate.
|
||||
drop (float): The dropout rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
|
||||
|
|
|
@ -1195,23 +1195,26 @@ class EntityLinker(Pipe):
|
|||
docs = [docs]
|
||||
golds = [golds]
|
||||
|
||||
context_docs = []
|
||||
sentence_docs = []
|
||||
|
||||
for doc, gold in zip(docs, golds):
|
||||
ents_by_offset = dict()
|
||||
for ent in doc.ents:
|
||||
ents_by_offset["{}_{}".format(ent.start_char, ent.end_char)] = ent
|
||||
ents_by_offset[(ent.start_char, ent.end_char)] = ent
|
||||
|
||||
for entity, kb_dict in gold.links.items():
|
||||
start, end = entity
|
||||
mention = doc.text[start:end]
|
||||
# the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
|
||||
ent = ents_by_offset[(start, end)]
|
||||
|
||||
for kb_id, value in kb_dict.items():
|
||||
# Currently only training on the positive instances
|
||||
if value:
|
||||
context_docs.append(doc)
|
||||
sentence_docs.append(ent.sent.as_doc())
|
||||
|
||||
context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
|
||||
loss, d_scores = self.get_similarity_loss(scores=context_encodings, golds=golds, docs=None)
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
|
||||
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
|
||||
bp_context(d_scores, sgd=sgd)
|
||||
|
||||
if losses is not None:
|
||||
|
@ -1280,22 +1283,40 @@ class EntityLinker(Pipe):
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
||||
context_encodings = self.model(docs)
|
||||
xp = get_array_module(context_encodings)
|
||||
|
||||
for i, doc in enumerate(docs):
|
||||
if len(doc) > 0:
|
||||
# Looping through each sentence and each entity
|
||||
# This may go wrong if there are entities across sentences - because they might not get a KB ID
|
||||
for sent in doc.ents:
|
||||
sent_doc = sent.as_doc()
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
context_encoding = context_encodings[i]
|
||||
context_enc_t = context_encoding.T
|
||||
norm_1 = xp.linalg.norm(context_enc_t)
|
||||
for ent in doc.ents:
|
||||
sentence_encoding = self.model([sent_doc])[0]
|
||||
xp = get_array_module(sentence_encoding)
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||
|
||||
for ent in sent_doc.ents:
|
||||
entity_count += 1
|
||||
|
||||
if ent.label_ in self.cfg.get("labels_discard", []):
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
final_tensors.append(sentence_encoding)
|
||||
|
||||
else:
|
||||
candidates = self.kb.get_candidates(ent.text)
|
||||
if not candidates:
|
||||
final_kb_ids.append(self.NIL) # no prediction possible for this entity
|
||||
final_tensors.append(context_encoding)
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
final_tensors.append(sentence_encoding)
|
||||
|
||||
elif len(candidates) == 1:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
|
||||
# TODO: thresholding
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
final_tensors.append(sentence_encoding)
|
||||
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
|
||||
|
@ -1308,13 +1329,13 @@ class EntityLinker(Pipe):
|
|||
# add in similarity from the context
|
||||
if self.cfg.get("incl_context", True):
|
||||
entity_encodings = xp.asarray([c.entity_vector for c in candidates])
|
||||
norm_2 = xp.linalg.norm(entity_encodings, axis=1)
|
||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||
|
||||
if len(entity_encodings) != len(prior_probs):
|
||||
raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length"))
|
||||
|
||||
# cosine similarity
|
||||
sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2)
|
||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm)
|
||||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs*sims)
|
||||
|
@ -1323,7 +1344,7 @@ class EntityLinker(Pipe):
|
|||
best_index = scores.argmax()
|
||||
best_candidate = candidates[best_index]
|
||||
final_kb_ids.append(best_candidate.entity_)
|
||||
final_tensors.append(context_encoding)
|
||||
final_tensors.append(sentence_encoding)
|
||||
|
||||
if not (len(final_tensors) == len(final_kb_ids) == entity_count):
|
||||
raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))
|
||||
|
|
|
@ -131,6 +131,53 @@ def test_candidate_generation(nlp):
|
|||
assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9)
|
||||
|
||||
|
||||
def test_append_alias(nlp):
|
||||
"""Test that we can append additional alias-entity pairs"""
|
||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||
|
||||
# adding aliases
|
||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
# test the size of the relevant candidates
|
||||
assert len(mykb.get_candidates("douglas")) == 2
|
||||
|
||||
# append an alias
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
|
||||
|
||||
# test the size of the relevant candidates has been incremented
|
||||
assert len(mykb.get_candidates("douglas")) == 3
|
||||
|
||||
# append the same alias-entity pair again should not work (will throw a warning)
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
|
||||
|
||||
# test the size of the relevant candidates remained unchanged
|
||||
assert len(mykb.get_candidates("douglas")) == 3
|
||||
|
||||
|
||||
def test_append_invalid_alias(nlp):
|
||||
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
|
||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
|
||||
|
||||
# adding aliases
|
||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
|
||||
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
# append an alias - should fail because the entities and probabilities vectors are not of equal length
|
||||
with pytest.raises(ValueError):
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
|
||||
|
||||
|
||||
def test_preserving_links_asdoc(nlp):
|
||||
"""Test that Span.as_doc preserves the existing entity links"""
|
||||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
|
|
@ -430,7 +430,7 @@ def test_issue957(en_tokenizer):
|
|||
def test_issue999(train_data):
|
||||
"""Test that adding entities and resuming training works passably OK.
|
||||
There are two issues here:
|
||||
1) We have to read labels. This isn't very nice.
|
||||
1) We have to re-add labels. This isn't very nice.
|
||||
2) There's no way to set the learning rate for the weight update, so we
|
||||
end up out-of-scale, causing it to learn too fast.
|
||||
"""
|
||||
|
|
Loading…
Reference in New Issue
Block a user