Merge pull request #3864 from svlandeg/feature/nel-wiki

Entity linking using Wikipedia & Wikidata
2025-06-13 09:33:18 +03:00 · 2019-07-10 11:25:41 +02:00 · 2019-07-10 11:25:41 +02:00 · 6ba5ddbd5f
commit 6ba5ddbd5f
parent 4ebb4865fe f2ea3e3ea2
29 changed files with 2419 additions and 243 deletions
--- a/bin/init.py
+++ b/bin/init.py
--- a/bin/ud/conll17_ud_eval.py
+++ b/bin/ud/conll17_ud_eval.py
@ -292,8 +292,8 @@ def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
    def spans_score(gold_spans, system_spans):
        correct, gi, si = 0, 0, 0
-        undersegmented = list()
+        undersegmented = []
-        oversegmented = list()
+        oversegmented = []
        combo = 0
        previous_end_si_earlier = False
        previous_end_gi_earlier = False
--- a/bin/wiki_entity_linking/init.py
+++ b/bin/wiki_entity_linking/init.py
--- a/bin/wiki_entity_linking/kb_creator.py
+++ b/bin/wiki_entity_linking/kb_creator.py
@ -0,0 +1,171 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .train_descriptions import EntityEncoder
 from . import wikidata_processor as wd, wikipedia_processor as wp
 from spacy.kb import KnowledgeBase
 import csv
 import datetime
 INPUT_DIM = 300  # dimension of pre-trained input vectors
 DESC_WIDTH = 64  # dimension of output entity vectors
 def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
              entity_def_output, entity_descr_output,
              count_input, prior_prob_input, wikidata_input):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH)
    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    read_raw_data = True
    if read_raw_data:
        print()
        print(" * _read_wikidata_entities", datetime.datetime.now())
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input)
        # write the title-ID and ID-description mappings to file
        _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr)
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)
    print()
    print(" * _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)
    # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()),
          "titles with filter frequency", min_entity_freq)
    print()
    print(" * train entity encoder", datetime.datetime.now())
    print()
    encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
    encoder.train(description_list=description_list, to_print=True)
    print()
    print(" * get entity embeddings", datetime.datetime.now())
    print()
    embeddings = encoder.apply_encoder(description_list)
    print()
    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
    kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings)
    print()
    print(" * adding aliases", datetime.datetime.now())
    print()
    _add_aliases(kb, title_to_id=filtered_title_to_id,
                 max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
                 prior_prob_input=prior_prob_input)
    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
    print("done with kb", datetime.datetime.now())
    return kb
 def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr):
    with open(entity_def_output, mode='w', encoding='utf8') as id_file:
        id_file.write("WP_title" + "|" + "WD_id" + "\n")
        for title, qid in title_to_id.items():
            id_file.write(title + "|" + str(qid) + "\n")
    with open(entity_descr_output, mode='w', encoding='utf8') as descr_file:
        descr_file.write("WD_id" + "|" + "description" + "\n")
        for qid, descr in id_to_descr.items():
            descr_file.write(str(qid) + "|" + descr + "\n")
 def get_entity_to_id(entity_def_output):
    entity_to_id = dict()
    with open(entity_def_output, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_id[row[0]] = row[1]
    return entity_to_id
 def get_id_to_description(entity_descr_output):
    id_to_desc = dict()
    with open(entity_descr_output, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            id_to_desc[row[0]] = row[1]
    return id_to_desc
 def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input):
    wp_titles = title_to_id.keys()
    # adding aliases with prior probabilities
    # we can read this file sequentially, it's sorted by alias, and then by count
    with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()
        previous_alias = None
        total_count = 0
        counts = []
        entities = []
        while line:
            splits = line.replace('\n', "").split(sep='|')
            new_alias = splits[0]
            count = int(splits[1])
            entity = splits[2]
            if new_alias != previous_alias and previous_alias:
                # done reading the previous alias --> output
                if len(entities) > 0:
                    selected_entities = []
                    prior_probs = []
                    for ent_count, ent_string in zip(counts, entities):
                        if ent_string in wp_titles:
                            wd_id = title_to_id[ent_string]
                            p_entity_givenalias = ent_count / total_count
                            selected_entities.append(wd_id)
                            prior_probs.append(p_entity_givenalias)
                    if selected_entities:
                        try:
                            kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs)
                        except ValueError as e:
                            print(e)
                total_count = 0
                counts = []
                entities = []
            total_count += count
            if len(entities) < max_entities_per_alias and count >= min_occ:
                counts.append(count)
                entities.append(entity)
            previous_alias = new_alias
            line = prior_file.readline()
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@ -0,0 +1,121 @@
 # coding: utf-8
 from random import shuffle
 import numpy as np
 from spacy._ml import zero_init, create_default_optimizer
 from spacy.cli.pretrain import get_cossim_loss
 from thinc.v2v import Model
 from thinc.api import chain
 from thinc.neural._classes.affine import Affine
 class EntityEncoder:
    """
    Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
    This entity vector will be stored in the KB, for further downstream use in the entity model.
    """
    DROP = 0
    EPOCHS = 5
    STOP_THRESHOLD = 0.04
    BATCH_SIZE = 1000
    def __init__(self, nlp, input_dim, desc_width):
        self.nlp = nlp
        self.input_dim = input_dim
        self.desc_width = desc_width
    def apply_encoder(self, description_list):
        if self.encoder is None:
            raise ValueError("Can not apply encoder before training it")
        batch_size = 100000
        start = 0
        stop = min(batch_size, len(description_list))
        encodings = []
        while start < len(description_list):
            docs = list(self.nlp.pipe(description_list[start:stop]))
            doc_embeddings = [self._get_doc_embedding(doc) for doc in docs]
            enc = self.encoder(np.asarray(doc_embeddings))
            encodings.extend(enc.tolist())
            start = start + batch_size
            stop = min(stop + batch_size, len(description_list))
        return encodings
    def train(self, description_list, to_print=False):
        processed, loss = self._train_model(description_list)
        if to_print:
            print("Trained on", processed, "entities across", self.EPOCHS, "epochs")
            print("Final loss:", loss)
    def _train_model(self, description_list):
        # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy
        self._build_network(self.input_dim, self.desc_width)
        processed = 0
        loss = 1
        descriptions = description_list.copy()   # copy this list so that shuffling does not affect other functions
        for i in range(self.EPOCHS):
            shuffle(descriptions)
            batch_nr = 0
            start = 0
            stop = min(self.BATCH_SIZE, len(descriptions))
            while loss > self.STOP_THRESHOLD and start < len(descriptions):
                batch = []
                for descr in descriptions[start:stop]:
                    doc = self.nlp(descr)
                    doc_vector = self._get_doc_embedding(doc)
                    batch.append(doc_vector)
                loss = self._update(batch)
                print(i, batch_nr, loss)
                processed += len(batch)
                batch_nr += 1
                start = start + self.BATCH_SIZE
                stop = min(stop + self.BATCH_SIZE, len(descriptions))
        return processed, loss
    @staticmethod
    def _get_doc_embedding(doc):
        indices = np.zeros((len(doc),), dtype="i")
        for i, word in enumerate(doc):
            if word.orth in doc.vocab.vectors.key2row:
                indices[i] = doc.vocab.vectors.key2row[word.orth]
            else:
                indices[i] = 0
        word_vectors = doc.vocab.vectors.data[indices]
        doc_vector = np.mean(word_vectors, axis=0)
        return doc_vector
    def _build_network(self, orig_width, hidden_with):
        with Model.define_operators({">>": chain}):
            # very simple encoder-decoder model
            self.encoder = (
                Affine(hidden_with, orig_width)
            )
            self.model = self.encoder >> zero_init(Affine(orig_width, hidden_with, drop_factor=0.0))
        self.sgd = create_default_optimizer(self.model.ops)
    def _update(self, vectors):
        predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP)
        loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors))
        bp_model(d_scores, sgd=self.sgd)
        return loss / len(vectors)
    @staticmethod
    def _get_loss(golds, scores):
        loss, gradients = get_cossim_loss(scores, golds)
        return loss, gradients
--- a/bin/wiki_entity_linking/training_set_creator.py
+++ b/bin/wiki_entity_linking/training_set_creator.py
@ -0,0 +1,353 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import os
 import re
 import bz2
 import datetime
 from spacy.gold import GoldParse
 from bin.wiki_entity_linking import kb_creator
 """
 Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
 Gold-standard entities are stored in one file in standoff format (by character offset).
 """
 ENTITY_FILE = "gold_entities.csv"
 def create_training(wikipedia_input, entity_def_input, training_output):
    wp_to_id = kb_creator.get_entity_to_id(entity_def_input)
    _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None)
 def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None):
    """
    Read the XML wikipedia data to parse out training data:
    raw text data + positive instances
    """
    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
    read_ids = set()
    entityfile_loc = training_output / ENTITY_FILE
    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
        # write entity training header file
        _write_training_entity(outputfile=entityfile,
                               article_id="article_id",
                               alias="alias",
                               entity="WD_id",
                               start="start",
                               end="end")
        with bz2.open(wikipedia_input, mode='rb') as file:
            line = file.readline()
            cnt = 0
            article_text = ""
            article_title = None
            article_id = None
            reading_text = False
            reading_revision = False
            while line and (not limit or cnt < limit):
                if cnt % 1000000 == 0:
                    print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = line.strip().decode("utf-8")
                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False
                # Start reading new page
                if clean_line == "<page>":
                    article_text = ""
                    article_title = None
                    article_id = None
                # finished reading this page
                elif clean_line == "</page>":
                    if article_id:
                        try:
                            _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(),
                                             training_output)
                        except Exception as e:
                            print("Error processing article", article_id, article_title, e)
                    else:
                        print("Done processing a page, but couldn't find an article_id ?", article_title)
                    article_text = ""
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False
                # start reading text within a page
                if "<text" in clean_line:
                    reading_text = True
                if reading_text:
                    article_text += " " + clean_line
                # stop reading text within a page (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False
                # read the ID of this article (outside the revision portion of the document)
                if not reading_revision:
                    ids = id_regex.search(clean_line)
                    if ids:
                        article_id = ids[0]
                        if article_id in read_ids:
                            print("Found duplicate article ID", article_id, clean_line)  # This should never happen ...
                        read_ids.add(article_id)
                # read the title of this article (outside the revision portion of the document)
                if not reading_revision:
                    titles = title_regex.search(clean_line)
                    if titles:
                        article_title = titles[0].strip()
                line = file.readline()
                cnt += 1
 text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')
 def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text, training_output):
    found_entities = False
    # ignore meta Wikipedia pages
    if article_title.startswith("Wikipedia:"):
        return
    # remove the text tags
    text = text_regex.search(article_text).group(0)
    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return
    # get the raw text without markup etc, keeping only interwiki links
    clean_text = _get_clean_wp_text(text)
    # read the text char by char to get the right offsets for the interwiki links
    final_text = ""
    open_read = 0
    reading_text = True
    reading_entity = False
    reading_mention = False
    reading_special_case = False
    entity_buffer = ""
    mention_buffer = ""
    for index, letter in enumerate(clean_text):
        if letter == '[':
            open_read += 1
        elif letter == ']':
            open_read -= 1
        elif letter == '|':
            if reading_text:
                final_text += letter
            # switch from reading entity to mention in the [[entity|mention]] pattern
            elif reading_entity:
                reading_text = False
                reading_entity = False
                reading_mention = True
            else:
                reading_special_case = True
        else:
            if reading_entity:
                entity_buffer += letter
            elif reading_mention:
                mention_buffer += letter
            elif reading_text:
                final_text += letter
            else:
                raise ValueError("Not sure at point", clean_text[index-2:index+2])
        if open_read > 2:
            reading_special_case = True
        if open_read == 2 and reading_text:
            reading_text = False
            reading_entity = True
            reading_mention = False
        # we just finished reading an entity
        if open_read == 0 and not reading_text:
            if '#' in entity_buffer or entity_buffer.startswith(':'):
                reading_special_case = True
            # Ignore cases with nested structures like File: handles etc
            if not reading_special_case:
                if not mention_buffer:
                    mention_buffer = entity_buffer
                start = len(final_text)
                end = start + len(mention_buffer)
                qid = wp_to_id.get(entity_buffer, None)
                if qid:
                    _write_training_entity(outputfile=entityfile,
                                           article_id=article_id,
                                           alias=mention_buffer,
                                           entity=qid,
                                           start=start,
                                           end=end)
                found_entities = True
                final_text += mention_buffer
            entity_buffer = ""
            mention_buffer = ""
            reading_text = True
            reading_entity = False
            reading_mention = False
            reading_special_case = False
    if found_entities:
        _write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output)
 info_regex = re.compile(r'{[^{]*?}')
 htlm_regex = re.compile(r'&lt;!--[^-]*--&gt;')
 category_regex = re.compile(r'\[\[Category:[^\[]*]]')
 file_regex = re.compile(r'\[\[File:[^[\]]+]]')
 ref_regex = re.compile(r'&lt;ref.*?&gt;')     # non-greedy
 ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')  # non-greedy
 def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()
    # remove bolding & italic markup
    clean_text = clean_text.replace('\'\'\'', '')
    clean_text = clean_text.replace('\'\'', '')
    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)
    # remove HTML comments
    clean_text = htlm_regex.sub('', clean_text)
    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)
    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")
    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")
    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)
    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)
    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')
    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')
    return clean_text.strip()
 def _write_training_article(article_id, clean_text, training_output):
    file_loc = training_output / str(article_id) + ".txt"
    with open(file_loc, mode='w', encoding='utf8') as outputfile:
        outputfile.write(clean_text)
 def _write_training_entity(outputfile, article_id, alias, entity, start, end):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n")
 def is_dev(article_id):
    return article_id.endswith("3")
 def read_training(nlp, training_dir, dev, limit):
    # This method provides training examples that correspond to the entity annotations found by the nlp object
    entityfile_loc = training_dir / ENTITY_FILE
    data = []
    # assume the data is written sequentially, so we can reuse the article docs
    current_article_id = None
    current_doc = None
    ents_by_offset = dict()
    skip_articles = set()
    total_entities = 0
    with open(entityfile_loc, mode='r', encoding='utf8') as file:
        for line in file:
            if not limit or len(data) < limit:
                fields = line.replace('\n', "").split(sep='|')
                article_id = fields[0]
                alias = fields[1]
                wp_title = fields[2]
                start = fields[3]
                end = fields[4]
                if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles:
                    if not current_doc or (current_article_id != article_id):
                        # parse the new article text
                        file_name = article_id + ".txt"
                        try:
                            with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f:
                                text = f.read()
                                if len(text) < 30000:   # threshold for convenience / speed of processing
                                    current_doc = nlp(text)
                                    current_article_id = article_id
                                    ents_by_offset = dict()
                                    for ent in current_doc.ents:
                                        sent_length = len(ent.sent)
                                        # custom filtering to avoid too long or too short sentences
                                        if 5 < sent_length < 100:
                                            ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent
                                else:
                                    skip_articles.add(article_id)
                                    current_doc = None
                        except Exception as e:
                            print("Problem parsing article", article_id, e)
                            skip_articles.add(article_id)
                            raise e
                    # repeat checking this condition in case an exception was thrown
                    if current_doc and (current_article_id == article_id):
                        found_ent = ents_by_offset.get(start + "_" + end,  None)
                        if found_ent:
                            if found_ent.text != alias:
                                skip_articles.add(article_id)
                                current_doc = None
                            else:
                                sent = found_ent.sent.as_doc()
                                # currently feeding the gold data one entity per sentence at a time
                                gold_start = int(start) - found_ent.sent.start_char
                                gold_end = int(end) - found_ent.sent.start_char
                                gold_entities = [(gold_start, gold_end, wp_title)]
                                gold = GoldParse(doc=sent, links=gold_entities)
                                data.append((sent, gold))
                                total_entities += 1
                                if len(data) % 2500 == 0:
                                    print(" -read", total_entities, "entities")
    print(" -read", total_entities, "entities")
    return data
--- a/bin/wiki_entity_linking/wikidata_processor.py
+++ b/bin/wiki_entity_linking/wikidata_processor.py
@ -0,0 +1,119 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import bz2
 import json
 import datetime
 def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False):
    # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines.
    # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
    lang = 'en'
    site_filter = 'enwiki'
    # properties filter (currently disabled to get ALL data)
    prop_filter = dict()
    # prop_filter = {'P31': {'Q5', 'Q15632617'}}     # currently defined as OR: one property suffices to be selected
    title_to_id = dict()
    id_to_descr = dict()
    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_descriptions = True
    parse_aliases = False
    parse_claims = False
    with bz2.open(wikidata_file, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line and (not limit or cnt < limit):
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
                entry_type = obj["type"]
                if entry_type == "item":
                    # filtering records on their properties (currently disabled to get ALL data)
                    # keep = False
                    keep = True
                    claims = obj["claims"]
                    if parse_claims:
                        for prop, value_set in prop_filter.items():
                            claim_property = claims.get(prop, None)
                            if claim_property:
                                for cp in claim_property:
                                    cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
                                    cp_rank = cp['rank']
                                    if cp_rank != "deprecated" and cp_id in value_set:
                                        keep = True
                    if keep:
                        unique_id = obj["id"]
                        if to_print:
                            print("ID:", unique_id)
                            print("type:", entry_type)
                        # parsing all properties that refer to other entities
                        if parse_properties:
                            for prop, claim_property in claims.items():
                                cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property
                                            if cp['mainsnak'].get('datavalue')]
                                cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict)
                                             if cp_dict.get('id') is not None]
                                if cp_values:
                                    if to_print:
                                        print("prop:", prop, cp_values)
                        found_link = False
                        if parse_sitelinks:
                            site_value = obj["sitelinks"].get(site_filter, None)
                            if site_value:
                                site = site_value['title']
                                if to_print:
                                    print(site_filter, ":", site)
                                title_to_id[site] = unique_id
                                found_link = True
                        if parse_labels:
                            labels = obj["labels"]
                            if labels:
                                lang_label = labels.get(lang, None)
                                if lang_label:
                                    if to_print:
                                        print("label (" + lang + "):", lang_label["value"])
                        if found_link and parse_descriptions:
                            descriptions = obj["descriptions"]
                            if descriptions:
                                lang_descr = descriptions.get(lang, None)
                                if lang_descr:
                                    if to_print:
                                        print("description (" + lang + "):", lang_descr["value"])
                                    id_to_descr[unique_id] = lang_descr["value"]
                        if parse_aliases:
                            aliases = obj["aliases"]
                            if aliases:
                                lang_aliases = aliases.get(lang, None)
                                if lang_aliases:
                                    for item in lang_aliases:
                                        if to_print:
                                            print("alias (" + lang + "):", item["value"])
                        if to_print:
                            print()
            line = file.readline()
            cnt += 1
    return title_to_id, id_to_descr
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@ -0,0 +1,182 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 import bz2
 import csv
 import datetime
 """
 Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
 Write these results to file for downstream KB and training data generation.
 """
 map_alias_to_link = dict()
 # these will/should be matched ignoring case
 wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
                   "d", "dbdump", "download", "Draft", "Education", "Foundation",
                   "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator",
                   "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki",
                   "MediaZilla", "Meta", "Metawikipedia", "Module",
                   "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki",
                   "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev",
                   "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn",
                   "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools",
                   "tswiki", "User", "User talk", "v", "voy",
                   "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews",
                   "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech",
                   "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"]
 # find the links
 link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
 # match on interwiki links, e.g. `en:` or `:fr:`
 ns_regex = r":?" + "[a-z][a-z]" + ":"
 # match on Namespace: optionally preceded by a :
 for ns in wiki_namespaces:
    ns_regex += "|" + ":?" + ns + ":"
 ns_regex = re.compile(ns_regex, re.IGNORECASE)
 def read_wikipedia_prior_probs(wikipedia_input, prior_prob_output):
    """
    Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities.
    The full file takes about 2h to parse 1100M lines.
    It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from.
    """
    with bz2.open(wikipedia_input, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line:
            if cnt % 5000000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
            clean_line = line.strip().decode("utf-8")
            aliases, entities, normalizations = get_wp_links(clean_line)
            for alias, entity, norm in zip(aliases, entities, normalizations):
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
            line = file.readline()
            cnt += 1
    # write all aliases and their entities and count occurrences to file
    with open(prior_prob_output, mode='w', encoding='utf8') as outputfile:
        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
 def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
    alias = alias.strip()
    entity = entity.strip()
    # remove everything after # as this is not part of the title but refers to a specific paragraph
    if normalize_entity:
        # wikipedia titles are always capitalized
        entity = _capitalize_first(entity.split("#")[0])
    if normalize_alias:
        alias = alias.split("#")[0]
    if alias and entity:
        alias_dict = map_alias_to_link.get(alias, dict())
        entity_count = alias_dict.get(entity, 0)
        alias_dict[entity] = entity_count + 1
        map_alias_to_link[alias] = alias_dict
 def get_wp_links(text):
    aliases = []
    entities = []
    normalizations = []
    matches = link_regex.findall(text)
    for match in matches:
        match = match[2:][:-2].replace("_", " ").strip()
        if ns_regex.match(match):
            pass  # ignore namespaces at the beginning of the string
        # this is a simple [[link]], with the alias the same as the mention
        elif "|" not in match:
            aliases.append(match)
            entities.append(match)
            normalizations.append(True)
        # in wiki format, the link is written as [[entity|alias]]
        else:
            splits = match.split("|")
            entity = splits[0].strip()
            alias = splits[1].strip()
            # specific wiki format  [[alias (specification)|]]
            if len(alias) == 0 and "(" in entity:
                alias = entity.split("(")[0]
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)
            else:
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)
    return aliases, entities, normalizations
 def _capitalize_first(text):
    if not text:
        return None
    result = text[0].capitalize()
    if len(result) > 0:
        result += text[1:]
    return result
 def write_entity_counts(prior_prob_input, count_output, to_print=False):
    # Write entity counts for quick access later
    entity_to_count = dict()
    total_count = 0
    with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()
        while line:
            splits = line.replace('\n', "").split(sep='|')
            # alias = splits[0]
            count = int(splits[1])
            entity = splits[2]
            current_count = entity_to_count.get(entity, 0)
            entity_to_count[entity] = current_count + count
            total_count += count
            line = prior_file.readline()
    with open(count_output, mode='w', encoding='utf8') as entity_file:
        entity_file.write("entity" + "|" + "count" + "\n")
        for entity, count in entity_to_count.items():
            entity_file.write(entity + "|" + str(count) + "\n")
    if to_print:
        for entity, count in entity_to_count.items():
            print("Entity count:", entity, count)
        print("Total count:", total_count)
 def get_all_frequencies(count_input):
    entity_to_count = dict()
    with open(count_input, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_count[row[0]] = int(row[1])
    return entity_to_count
--- a/examples/pipeline/dummy_entity_linking.py
+++ b/examples/pipeline/dummy_entity_linking.py
@ -9,26 +9,26 @@ from spacy.kb import KnowledgeBase
 def create_kb(vocab):
-    kb = KnowledgeBase(vocab=vocab)
+    kb = KnowledgeBase(vocab=vocab, entity_vector_length=1)
    # adding entities
    entity_0 = "Q1004791_Douglas"
    print("adding entity", entity_0)
-    kb.add_entity(entity=entity_0, prob=0.5)
+    kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0])
    entity_1 = "Q42_Douglas_Adams"
    print("adding entity", entity_1)
-    kb.add_entity(entity=entity_1, prob=0.5)
+    kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1])
    entity_2 = "Q5301561_Douglas_Haig"
    print("adding entity", entity_2)
-    kb.add_entity(entity=entity_2, prob=0.5)
+    kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2])
    # adding aliases
    print()
    alias_0 = "Douglas"
    print("adding alias", alias_0)
-    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2])
+    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])
    alias_1 = "Douglas Adams"
    print("adding alias", alias_1)
@ -41,8 +41,12 @@ def create_kb(vocab):
 def add_el(kb, nlp):
-    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64})
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)
    nlp.begin_training()
    el_pipe.context_weight = 0
    el_pipe.prior_weight = 1
    for alias in ["Douglas Adams", "Douglas"]:
        candidates = nlp.linker.kb.get_candidates(alias)
@ -66,6 +70,6 @@ def add_el(kb, nlp):
 if __name__ == "__main__":
-    nlp = spacy.load('en_core_web_sm')
+    my_nlp = spacy.load('en_core_web_sm')
-    my_kb = create_kb(nlp.vocab)
+    my_kb = create_kb(my_nlp.vocab)
-    add_el(my_kb, nlp)
+    add_el(my_kb, my_nlp)
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@ -0,0 +1,442 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import random
 import datetime
 from pathlib import Path
 from bin.wiki_entity_linking import training_set_creator, kb_creator, wikipedia_processor as wp
 from bin.wiki_entity_linking.kb_creator import DESC_WIDTH
 import spacy
 from spacy.kb import KnowledgeBase
 from spacy.util import minibatch, compounding
 """
 Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
 """
 ROOT_DIR = Path("C:/Users/Sofie/Documents/data/")
 OUTPUT_DIR = ROOT_DIR / 'wikipedia'
 TRAINING_DIR = OUTPUT_DIR / 'training_data_nel'
 PRIOR_PROB = OUTPUT_DIR / 'prior_prob.csv'
 ENTITY_COUNTS = OUTPUT_DIR / 'entity_freq.csv'
 ENTITY_DEFS = OUTPUT_DIR / 'entity_defs.csv'
 ENTITY_DESCR = OUTPUT_DIR / 'entity_descriptions.csv'
 KB_FILE = OUTPUT_DIR / 'kb_1' / 'kb'
 NLP_1_DIR = OUTPUT_DIR / 'nlp_1'
 NLP_2_DIR = OUTPUT_DIR / 'nlp_2'
 # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
 WIKIDATA_JSON = ROOT_DIR / 'wikidata' / 'wikidata-20190304-all.json.bz2'
 # get enwiki-latest-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/enwiki/latest/
 ENWIKI_DUMP = ROOT_DIR / 'wikipedia' / 'enwiki-20190320-pages-articles-multistream.xml.bz2'
 # KB construction parameters
 MAX_CANDIDATES = 10
 MIN_ENTITY_FREQ = 20
 MIN_PAIR_OCC = 5
 # model training parameters
 EPOCHS = 10
 DROPOUT = 0.5
 LEARN_RATE = 0.005
 L2 = 1e-6
 CONTEXT_WIDTH = 128
 def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be re(run)
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None
    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False
    # read KB back in from file
    to_read_kb = True
    to_test_kb = False
    # create training dataset
    create_wp_training = False
    # train the EL pipe
    train_pipe = True
    measure_performance = True
    # test the EL pipe on a simple example
    to_test_pipeline = True
    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False
    # STEP 1 : create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB)
        print()
    # STEP 2 : deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()
    # STEP 3 : create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                    max_entities_per_alias=MAX_CANDIDATES,
                                    min_entity_freq=MIN_ENTITY_FREQ,
                                    min_occ=MIN_PAIR_OCC,
                                    entity_def_output=ENTITY_DEFS,
                                    entity_descr_output=ENTITY_DESCR,
                                    count_input=ENTITY_COUNTS,
                                    prior_prob_input=PRIOR_PROB,
                                    wikidata_input=WIKIDATA_JSON)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()
        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()
    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()
        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()
    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP,
                                             entity_def_input=ENTITY_DEFS,
                                             training_output=TRAINING_DIR)
    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(name='entity_linker',
                                    config={"context_width": CONTEXT_WIDTH,
                                            "pretrained_vectors": nlp_2.vocab.vectors.name,
                                            "type_to_int": type_to_int})
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)
        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2
        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000
        train_data = training_set_creator.read_training(nlp=nlp_2,
                                                        training_dir=TRAINING_DIR,
                                                        dev=False,
                                                        limit=train_limit)
        print("Training on", len(train_data), "articles")
        print()
        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)
        print("Dev testing on", len(dev_data), "articles")
        print()
        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                batchnr = 0
                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)
                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
                          " / dev acc avg", round(dev_acc_context, 3))
        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
            print()
            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2)
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()])
            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])
            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])
        # STEP 8: apply the EL pipe on a toy example
        if to_test_pipeline:
            print()
            print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())
            print()
            run_el_toy_example(nlp=nlp_2)
        # STEP 9: write the NLP pipeline (including entity linker) to file
        if to_write_nlp:
            print()
            print("STEP 9: testing NLP IO", datetime.datetime.now())
            print()
            print("writing to", NLP_2_DIR)
            nlp_2.to_disk(NLP_2_DIR)
            print()
    # verify that the IO has gone correctly
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)
    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")
        dev_limit = 5000
        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)
        print("Dev testing from file on", len(dev_data), "articles")
        print()
        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])
    print()
    print("STOP", datetime.datetime.now())
 def _measure_accuracy(data, el_pipe=None, error_analysis=False):
    # If the docs in the data require further processing with an entity linker, set el_pipe
    correct_by_label = dict()
    incorrect_by_label = dict()
    docs = [d for d, g in data if len(d) > 0]
    if el_pipe is not None:
        docs = list(el_pipe.pipe(docs))
    golds = [g for d, g in data if len(d) > 0]
    for doc, gold in zip(docs, golds):
        try:
            correct_entries_per_article = dict()
            for entity in gold.links:
                start, end, gold_kb = entity
                correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb
            for ent in doc.ents:
                ent_label = ent.label_
                pred_entity = ent.kb_id_
                start = ent.start_char
                end = ent.end_char
                gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None)
                # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
                if gold_entity is not None:
                    if gold_entity == pred_entity:
                        correct = correct_by_label.get(ent_label, 0)
                        correct_by_label[ent_label] = correct + 1
                    else:
                        incorrect = incorrect_by_label.get(ent_label, 0)
                        incorrect_by_label[ent_label] = incorrect + 1
                        if error_analysis:
                            print(ent.text, "in", doc)
                            print("Predicted",  pred_entity, "should have been", gold_entity)
                            print()
        except Exception as e:
            print("Error assessing accuracy", e)
    acc, acc_by_label = calculate_acc(correct_by_label,  incorrect_by_label)
    return acc, acc_by_label
 def _measure_baselines(data, kb):
    # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound
    counts_by_label = dict()
    random_correct_by_label = dict()
    random_incorrect_by_label = dict()
    oracle_correct_by_label = dict()
    oracle_incorrect_by_label = dict()
    prior_correct_by_label = dict()
    prior_incorrect_by_label = dict()
    docs = [d for d, g in data if len(d) > 0]
    golds = [g for d, g in data if len(d) > 0]
    for doc, gold in zip(docs, golds):
        try:
            correct_entries_per_article = dict()
            for entity in gold.links:
                start, end, gold_kb = entity
                correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb
            for ent in doc.ents:
                ent_label = ent.label_
                start = ent.start_char
                end = ent.end_char
                gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None)
                # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
                if gold_entity is not None:
                    counts_by_label[ent_label] = counts_by_label.get(ent_label, 0) + 1
                    candidates = kb.get_candidates(ent.text)
                    oracle_candidate = ""
                    best_candidate = ""
                    random_candidate = ""
                    if candidates:
                        scores = []
                        for c in candidates:
                            scores.append(c.prior_prob)
                            if c.entity_ == gold_entity:
                                oracle_candidate = c.entity_
                        best_index = scores.index(max(scores))
                        best_candidate = candidates[best_index].entity_
                        random_candidate = random.choice(candidates).entity_
                    if gold_entity == best_candidate:
                        prior_correct_by_label[ent_label] = prior_correct_by_label.get(ent_label, 0) + 1
                    else:
                        prior_incorrect_by_label[ent_label] = prior_incorrect_by_label.get(ent_label, 0) + 1
                    if gold_entity == random_candidate:
                        random_correct_by_label[ent_label] = random_correct_by_label.get(ent_label, 0) + 1
                    else:
                        random_incorrect_by_label[ent_label] = random_incorrect_by_label.get(ent_label, 0) + 1
                    if gold_entity == oracle_candidate:
                        oracle_correct_by_label[ent_label] = oracle_correct_by_label.get(ent_label, 0) + 1
                    else:
                        oracle_incorrect_by_label[ent_label] = oracle_incorrect_by_label.get(ent_label, 0) + 1
        except Exception as e:
            print("Error assessing accuracy", e)
    acc_prior, acc_prior_by_label = calculate_acc(prior_correct_by_label, prior_incorrect_by_label)
    acc_rand, acc_rand_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label)
    acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label)
    return counts_by_label, acc_rand, acc_rand_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label
 def calculate_acc(correct_by_label, incorrect_by_label):
    acc_by_label = dict()
    total_correct = 0
    total_incorrect = 0
    all_keys = set()
    all_keys.update(correct_by_label.keys())
    all_keys.update(incorrect_by_label.keys())
    for label in sorted(all_keys):
        correct = correct_by_label.get(label, 0)
        incorrect = incorrect_by_label.get(label, 0)
        total_correct += correct
        total_incorrect += incorrect
        if correct == incorrect == 0:
            acc_by_label[label] = 0
        else:
            acc_by_label[label] = correct / (correct + incorrect)
    acc = 0
    if not (total_correct == total_incorrect == 0):
        acc = total_correct / (total_correct + total_incorrect)
    return acc, acc_by_label
 def check_kb(kb):
    for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"):
        candidates = kb.get_candidates(mention)
        print("generating candidates for " + mention + " :")
        for c in candidates:
            print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
        print()
 def run_el_toy_example(nlp):
    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel, even in China or Brazil. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)
    print(text)
    for ent in doc.ents:
        print(" ent", ent.text, ent.label_, ent.kb_id_)
    print()
 if __name__ == "__main__":
    run_pipeline()
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -652,6 +652,38 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
    return model
 def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    # TODO proper error
    if "entity_width" not in cfg:
        raise ValueError("entity_width not found")
    if "context_width" not in cfg:
        raise ValueError("context_width not found")
    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get("pretrained_vectors")   # self.nlp.vocab.vectors.name
    context_width = cfg.get("context_width")
    entity_width = cfg.get("entity_width")
    with Model.define_operators({">>": chain, "**": clone}):
        model = Affine(entity_width, entity_width+context_width+1+ner_types)\
                >> Affine(1, entity_width, drop_factor=0.0)\
                >> logistic
        # context encoder
        tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
                          cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth,
                          bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
                                >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
                                >> zero_init(Affine(context_width, hidden_width))
        model.tok2vec = tok2vec
    model.tok2vec = tok2vec
    model.tok2vec.nO = context_width
    model.nO = 1
    return model
@layerize
 def flatten(seqs, drop=0.0):
    ops = Model.ops
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -82,6 +82,7 @@ cdef enum attr_id_t:
    DEP
    ENT_IOB
    ENT_TYPE
    ENT_KB_ID
    HEAD
    SENT_START
    SPACY
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -84,6 +84,7 @@ IDS = {
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD,
    "SENT_START": SENT_START,
    "SPACY": SPACY,
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -301,7 +301,7 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
    elif objective == "cosine":
        loss, d_target = get_cossim_loss(prediction, target)
    else:
-        raise ValueError(Errors.E139.format(loss_func=objective))
+        raise ValueError(Errors.E142.format(loss_func=objective))
    return loss, d_target
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -399,7 +399,10 @@ class Errors(object):
    E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input includes either the "
            "`text` or `tokens` key. For more info, see the docs:\n"
            "https://spacy.io/api/cli#pretrain-jsonl")
-    E139 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'")
+    E139 = ("Knowledge base for component '{name}' not initialized. Did you forget to call set_kb()?")
    E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.")
    E141 = ("Entity vectors should be of length {required} instead of the provided {found}.")
    E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'")
@add_codes
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@ -31,6 +31,7 @@ cdef class GoldParse:
    cdef public list ents
    cdef public dict brackets
    cdef public object cats
    cdef public list links
    cdef readonly list cand_to_gold
    cdef readonly list gold_to_cand
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -427,7 +427,7 @@ cdef class GoldParse:
    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
                 heads=None, deps=None, entities=None, make_projective=False,
-                 cats=None, **_):
+                 cats=None, links=None, **_):
        """Create a GoldParse.
        doc (Doc): The document the annotations refer to.
@ -450,6 +450,8 @@ cdef class GoldParse:
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        links (iterable): A sequence of `(start_char, end_char, kb_id)` tuples,
            representing the external ID of an entity in a knowledge base.
        RETURNS (GoldParse): The newly constructed object.
        """
        if words is None:
@ -485,6 +487,7 @@ cdef class GoldParse:
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
        self.cats = {} if cats is None else dict(cats)
        self.links = links
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@ -1,53 +1,27 @@
 """Knowledge-base for entity or concept linking."""
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
 from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
-
+from .structs cimport KBEntryC, AliasC
-# Internal struct, for storage and disambiguation. This isn't what we return
+ctypedef vector[KBEntryC] entry_vec
-# to the user as the answer to "here's your entity". It's the minimum number
+ctypedef vector[AliasC] alias_vec
-# of bits we need to keep track of the answers.
+ctypedef vector[float] float_vec
-cdef struct _EntryC:
+ctypedef vector[float_vec] float_matrix
    # The hash of this entry's unique ID and name in the kB
    hash_t entity_hash
    # Allows retrieval of one or more vectors.
    # Each element of vector_rows should be an index into a vectors table.
    # Every entry should have the same number of vectors, so we can avoid storing
    # the number of vectors in each knowledge-base struct
    int32_t* vector_rows
    # Allows retrieval of a struct of non-vector features. We could make this a
    # pointer, but we have 32 bits left over in the struct after prob, so we'd
    # like this to only be 32 bits. We can also set this to -1, for the common
    # case where there are no features.
    int32_t feats_row
    # log probability of entity, based on corpus frequency
    float prob
 # Each alias struct stores a list of Entry pointers with their prior probabilities
 # for this specific mention/alias.
 cdef struct _AliasC:
    # All entry candidates for this alias
    vector[int64_t] entry_indices
    # Prior probability P(entity|alias) - should sum up to (at most) 1.
    vector[float] probs
 # Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob
@ -55,9 +29,10 @@ cdef class Candidate:
 cdef class KnowledgeBase:
    cdef Pool mem
    cpdef readonly Vocab vocab
    cdef int64_t entity_vector_length
    # This maps 64bit keys (hash of unique entity string)
-    # to 64bit values (position of the _EntryC struct in the _entries vector).
+    # to 64bit values (position of the _KBEntryC struct in the _entries vector).
    # The PreshMap is pretty space efficient, as it uses open addressing. So
    # the only overhead is the vacancy rate, which is approximately 30%.
    cdef PreshMap _entry_index
@ -66,7 +41,7 @@ cdef class KnowledgeBase:
    # over allocation.
    # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
    # Storing 1m entries would take 41.6mb under this scheme.
-    cdef vector[_EntryC] _entries
+    cdef entry_vec _entries
    # This maps 64bit keys (hash of unique alias string)
    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
@ -76,7 +51,7 @@ cdef class KnowledgeBase:
    # should be P(entity | mention), which is pretty important to know.
    # We can pack both pieces of information into a 64-bit value, to keep things
    # efficient.
-    cdef vector[_AliasC] _aliases_table
+    cdef alias_vec _aliases_table
    # This is the part which might take more space: storing various
    # categorical features for the entries, and storing vectors for disambiguation
@ -87,7 +62,7 @@ cdef class KnowledgeBase:
    # model, that embeds different features of the entities into vectors. We'll
    # still want some per-entity features, like the Wikipedia text or entity
    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
-    cdef object _vectors_table
+    cdef float_matrix _vectors_table
    # It's very useful to track categorical features, at least for output, even
    # if they're not useful in the model itself. For instance, we should be
@ -96,53 +71,102 @@ cdef class KnowledgeBase:
    # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table
    cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
        """Add an entity vector to the vectors table."""
        cdef int64_t new_index = self._vectors_table.size()
        self._vectors_table.push_back(entity_vector)
        return new_index
    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
-                                     int32_t* vector_rows, int feats_row):
+                                     int32_t vector_index, int feats_row) nogil:
-        """Add an entry to the knowledge base."""
+        """Add an entry to the vector of entries.
-        # This is what we'll map the hash key to. It's where the entry will sit
+        After calling this method, make sure to update also the _entry_index using the return value"""
        # This is what we'll map the entity hash key to. It's where the entry will sit
        # in the vector of entries, so we can get it later.
        cdef int64_t new_index = self._entries.size()
-        self._entries.push_back(
+
-            _EntryC(
+        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
-                entity_hash=entity_hash,
+        cdef KBEntryC entry
-                vector_rows=vector_rows,
+        entry.entity_hash = entity_hash
-                feats_row=feats_row,
+        entry.vector_index = vector_index
-                prob=prob
+        entry.feats_row = feats_row
-            ))
+        entry.prob = prob
-        self._entry_index[entity_hash] = new_index
+
        self._entries.push_back(entry)
        return new_index
-    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-        """Connect a mention to a list of potential entities with their prior probabilities ."""
+        """Connect a mention to a list of potential entities with their prior probabilities .
        After calling this method, make sure to update also the _alias_index using the return value"""
        # This is what we'll map the alias hash key to. It's where the alias will be defined
        # in the vector of aliases.
        cdef int64_t new_index = self._aliases_table.size()
-        self._aliases_table.push_back(
+        # Avoid struct initializer to enable nogil
-            _AliasC(
+        cdef AliasC alias
-                entry_indices=entry_indices,
+        alias.entry_indices = entry_indices
-                probs=probs
+        alias.probs = probs
-            ))
+
-        self._alias_index[alias_hash] = new_index
+        self._aliases_table.push_back(alias)
        return new_index
-    cdef inline _create_empty_vectors(self):
+    cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
        """ 
-        Making sure the first element of each vector is a dummy,
+        Initializing the vectors and making sure the first element of each vector is a dummy,
        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
        cf. https://github.com/explosion/preshed/issues/17
        """
        cdef int32_t dummy_value = 0
-        self.vocab.strings.add("")
+
-        self._entries.push_back(
+        # Avoid struct initializer to enable nogil
-            _EntryC(
+        cdef KBEntryC entry
-                entity_hash=self.vocab.strings[""],
+        entry.entity_hash = dummy_hash
-                vector_rows=&dummy_value,
+        entry.vector_index = dummy_value
-                feats_row=dummy_value,
+        entry.feats_row = dummy_value
-                prob=dummy_value
+        entry.prob = dummy_value
-            ))
+
-        self._aliases_table.push_back(
+        # Avoid struct initializer to enable nogil
-            _AliasC(
+        cdef vector[int64_t] dummy_entry_indices
-                entry_indices=[dummy_value],
+        dummy_entry_indices.push_back(0)
-                probs=[dummy_value]
+        cdef vector[float] dummy_probs
-            ))
+        dummy_probs.push_back(0)
        cdef AliasC alias
        alias.entry_indices = dummy_entry_indices
        alias.probs = dummy_probs
        self._entries.push_back(entry)
        self._aliases_table.push_back(alias)
    cpdef load_bulk(self, loc)
    cpdef set_entities(self, entity_list, prob_list, vector_list)
 cdef class Writer:
    cdef FILE* _fp
    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
    cdef int write_vector_element(self, float element) except -1
    cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1
    cdef int write_alias_length(self, int64_t alias_length) except -1
    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
    cdef int write_alias(self, int64_t entry_index, float prob) except -1
    cdef int _write(self, void* value, size_t size) except -1
 cdef class Reader:
    cdef FILE* _fp
    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
    cdef int read_vector_element(self, float* element) except -1
    cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1
    cdef int read_alias_length(self, int64_t* alias_length) except -1
    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
    cdef int _read(self, void* value, size_t size) except -1
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -1,13 +1,30 @@
 # cython: infer_types=True
 # cython: profile=True
 # coding: utf8
 from spacy.errors import Errors, Warnings, user_warning
 from pathlib import Path
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from cpython.exc cimport PyErr_SetFromErrno
 from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
 from libc.stdint cimport int32_t, int64_t
 from .typedefs cimport hash_t
 from os import path
 from libcpp.vector cimport vector
 cdef class Candidate:
-    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
+    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
        self.entity_vector = entity_vector
        self.alias_hash = alias_hash
        self.prior_prob = prior_prob
@ -19,7 +36,7 @@ cdef class Candidate:
    @property
    def entity_(self):
        """RETURNS (unicode): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity]
+        return self.kb.vocab.strings[self.entity_hash]
    @property
    def alias(self):
@ -29,7 +46,15 @@ cdef class Candidate:
    @property
    def alias_(self):
        """RETURNS (unicode): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias]
+        return self.kb.vocab.strings[self.alias_hash]
    @property
    def entity_freq(self):
        return self.entity_freq
    @property
    def entity_vector(self):
        return self.entity_vector
    @property
    def prior_prob(self):
@ -38,26 +63,41 @@ cdef class Candidate:
 cdef class KnowledgeBase:
-    def __init__(self, Vocab vocab):
+    def __init__(self, Vocab vocab, entity_vector_length):
        self.vocab = vocab
        self.mem = Pool()
        self.entity_vector_length = entity_vector_length
        self._entry_index = PreshMap()
        self._alias_index = PreshMap()
-        self.mem = Pool()
+
-        self._create_empty_vectors()
+        self.vocab.strings.add("")
        self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
    @property
    def entity_vector_length(self):
        """RETURNS (uint64): length of the entity vectors"""
        return self.entity_vector_length
    def __len__(self):
        return self.get_size_entities()
    def get_size_entities(self):
-        return self._entries.size() - 1  # not counting dummy element on index 0
+        return len(self._entry_index)
    def get_entity_strings(self):
        return [self.vocab.strings[x] for x in self._entry_index]
    def get_size_aliases(self):
-        return self._aliases_table.size() - 1 # not counting dummy element on index 0
+        return len(self._alias_index)
-    def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None):
+    def get_alias_strings(self):
        return [self.vocab.strings[x] for x in self._alias_index]
    def add_entity(self, unicode entity, float prob, vector[float] entity_vector):
        """
-        Add an entity to the KB.
+        Add an entity to the KB, optionally specifying its log probability based on corpus frequency
-        Return the hash of the entity ID at the end
+        Return the hash of the entity ID/name at the end.
        """
        cdef hash_t entity_hash = self.vocab.strings.add(entity)
@ -66,40 +106,72 @@ cdef class KnowledgeBase:
            user_warning(Warnings.W018.format(entity=entity))
            return
-        cdef int32_t dummy_value = 342
+        # Raise an error if the provided entity vector is not of the correct length
-        self.c_add_entity(entity_hash=entity_hash, prob=prob,
+        if len(entity_vector) != self.entity_vector_length:
-                          vector_rows=&dummy_value, feats_row=dummy_value)
+            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
-        # TODO self._vectors_table.get_pointer(vectors),
+
-        # self._features_table.get(features))
+        vector_index = self.c_add_vector(entity_vector=entity_vector)
        new_index = self.c_add_entity(entity_hash=entity_hash,
                                      prob=prob,
                                      vector_index=vector_index,
                                      feats_row=-1)  # Features table currently not implemented
        self._entry_index[entity_hash] = new_index
        return entity_hash
    cpdef set_entities(self, entity_list, prob_list, vector_list):
        if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list):
            raise ValueError(Errors.E140)
        nr_entities = len(entity_list)
        self._entry_index = PreshMap(nr_entities+1)
        self._entries = entry_vec(nr_entities+1)
        i = 0
        cdef KBEntryC entry
        while i < nr_entities:
            entity_vector = vector_list[i]
            if len(entity_vector) != self.entity_vector_length:
                raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
            entity_hash = self.vocab.strings.add(entity_list[i])
            entry.entity_hash = entity_hash
            entry.prob = prob_list[i]
            vector_index = self.c_add_vector(entity_vector=vector_list[i])
            entry.vector_index = vector_index
            entry.feats_row = -1   # Features table currently not implemented
            self._entries[i+1] = entry
            self._entry_index[entity_hash] = i+1
            i += 1
    def add_alias(self, unicode alias, entities, probabilities):
        """
        For a given alias, add its potential entities and prior probabilies to the KB.
        Return the alias_hash at the end
        """
        # Throw an error if the length of entities and probabilities are not the same
        if not len(entities) == len(probabilities):
            raise ValueError(Errors.E132.format(alias=alias,
                                                entities_length=len(entities),
                                                probabilities_length=len(probabilities)))
-        # Throw an error if the probabilities sum up to more than 1
+        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
        prob_sum = sum(probabilities)
-        if prob_sum > 1:
+        if prob_sum > 1.00001:
            raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
        cdef hash_t alias_hash = self.vocab.strings.add(alias)
-        # Return if this alias was added before
+        # Check whether this alias was added before
        if alias_hash in self._alias_index:
            user_warning(Warnings.W017.format(alias=alias))
            return
        cdef hash_t entity_hash
        cdef vector[int64_t] entry_indices
        cdef vector[float] probs
@ -112,20 +184,295 @@ cdef class KnowledgeBase:
            entry_indices.push_back(int(entry_index))
            probs.push_back(float(prob))
-        self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
        self._alias_index[alias_hash] = new_index
        return alias_hash
    def get_candidates(self, unicode alias):
        """ TODO: where to put this functionality ?"""
        cdef hash_t alias_hash = self.vocab.strings[alias]
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]
        return [Candidate(kb=self,
                          entity_hash=self._entries[entry_index].entity_hash,
                          entity_freq=self._entries[entry_index].prob,
                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
                          alias_hash=alias_hash,
                          prior_prob=prob)
                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                if entry_index != 0]
    def dump(self, loc):
        cdef Writer writer = Writer(loc)
        writer.write_header(self.get_size_entities(), self.entity_vector_length)
        # dumping the entity vectors in their original order
        i = 0
        for entity_vector in self._vectors_table:
            for element in entity_vector:
                writer.write_vector_element(element)
            i = i+1
        # dumping the entry records in the order in which they are in the _entries vector.
        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
        i = 1
        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
            entry = self._entries[entry_index]
            assert entry.entity_hash == entry_hash
            assert entry_index == i
            writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index)
            i = i+1
        writer.write_alias_length(self.get_size_aliases())
        # dumping the aliases in the order in which they are in the _alias_index vector.
        # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
        i = 1
        for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
            alias = self._aliases_table[alias_index]
            assert alias_index == i
            candidate_length = len(alias.entry_indices)
            writer.write_alias_header(alias_hash, candidate_length)
            for j in range(0, candidate_length):
                writer.write_alias(alias.entry_indices[j], alias.probs[j])
            i = i+1
        writer.close()
    cpdef load_bulk(self, loc):
        cdef hash_t entity_hash
        cdef hash_t alias_hash
        cdef int64_t entry_index
        cdef float prob
        cdef int32_t vector_index
        cdef KBEntryC entry
        cdef AliasC alias
        cdef float vector_element
        cdef Reader reader = Reader(loc)
        # STEP 0: load header and initialize KB
        cdef int64_t nr_entities
        cdef int64_t entity_vector_length
        reader.read_header(&nr_entities, &entity_vector_length)
        self.entity_vector_length = entity_vector_length
        self._entry_index = PreshMap(nr_entities+1)
        self._entries = entry_vec(nr_entities+1)
        self._vectors_table = float_matrix(nr_entities+1)
        # STEP 1: load entity vectors
        cdef int i = 0
        cdef int j = 0
        while i < nr_entities:
            entity_vector = float_vec(entity_vector_length)
            j = 0
            while j < entity_vector_length:
                reader.read_vector_element(&vector_element)
                entity_vector[j] = vector_element
                j = j+1
            self._vectors_table[i] = entity_vector
            i = i+1
        # STEP 2: load entities
        # we assume that the entity data was written in sequence
        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
        i = 1
        while i <= nr_entities:
            reader.read_entry(&entity_hash, &prob, &vector_index)
            entry.entity_hash = entity_hash
            entry.prob = prob
            entry.vector_index = vector_index
            entry.feats_row = -1    # Features table currently not implemented
            self._entries[i] = entry
            self._entry_index[entity_hash] = i
            i += 1
        # check that all entities were read in properly
        assert nr_entities == self.get_size_entities()
        # STEP 3: load aliases
        cdef int64_t nr_aliases
        reader.read_alias_length(&nr_aliases)
        self._alias_index = PreshMap(nr_aliases+1)
        self._aliases_table = alias_vec(nr_aliases+1)
        cdef int64_t nr_candidates
        cdef vector[int64_t] entry_indices
        cdef vector[float] probs
        i = 1
        # we assume the alias data was written in sequence
        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
        while i <= nr_aliases:
            reader.read_alias_header(&alias_hash, &nr_candidates)
            entry_indices = vector[int64_t](nr_candidates)
            probs = vector[float](nr_candidates)
            for j in range(0, nr_candidates):
                reader.read_alias(&entry_index, &prob)
                entry_indices[j] = entry_index
                probs[j] = prob
            alias.entry_indices = entry_indices
            alias.probs = probs
            self._aliases_table[i] = alias
            self._alias_index[alias_hash] = i
            i += 1
        # check that all aliases were read in properly
        assert nr_aliases == self.get_size_aliases()
 cdef class Writer:
    def __init__(self, object loc):
        if path.exists(loc):
            assert not path.isdir(loc), "%s is directory." % loc
        if isinstance(loc, Path):
            loc = bytes(loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self._fp = fopen(<char*>bytes_loc, 'wb')
        assert self._fp != NULL
        fseek(self._fp, 0, 0)
    def close(self):
        cdef size_t status = fclose(self._fp)
        assert status == 0
    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
        self._write(&nr_entries, sizeof(nr_entries))
        self._write(&entity_vector_length, sizeof(entity_vector_length))
    cdef int write_vector_element(self, float element) except -1:
        self._write(&element, sizeof(element))
    cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1:
        self._write(&entry_hash, sizeof(entry_hash))
        self._write(&entry_prob, sizeof(entry_prob))
        self._write(&vector_index, sizeof(vector_index))
        # Features table currently not implemented and not written to file
    cdef int write_alias_length(self, int64_t alias_length) except -1:
        self._write(&alias_length, sizeof(alias_length))
    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
        self._write(&alias_hash, sizeof(alias_hash))
        self._write(&candidate_length, sizeof(candidate_length))
    cdef int write_alias(self, int64_t entry_index, float prob) except -1:
        self._write(&entry_index, sizeof(entry_index))
        self._write(&prob, sizeof(prob))
    cdef int _write(self, void* value, size_t size) except -1:
        status = fwrite(value, size, 1, self._fp)
        assert status == 1, status
 cdef class Reader:
    def __init__(self, object loc):
        assert path.exists(loc)
        assert not path.isdir(loc)
        if isinstance(loc, Path):
            loc = bytes(loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
    def __dealloc__(self):
        fclose(self._fp)
    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
        status = self._read(nr_entries, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading header from input file")
        status = self._read(entity_vector_length, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading header from input file")
    cdef int read_vector_element(self, float* element) except -1:
        status = self._read(element, sizeof(float))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity vector from input file")
    cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1:
        status = self._read(entity_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity hash from input file")
        status = self._read(prob, sizeof(float))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity prob from input file")
        status = self._read(vector_index, sizeof(int32_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity vector from input file")
        if feof(self._fp):
            return 0
        else:
            return 1
    cdef int read_alias_length(self, int64_t* alias_length) except -1:
        status = self._read(alias_length, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading alias length from input file")
    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
        status = self._read(alias_hash, sizeof(hash_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading alias hash from input file")
        status = self._read(candidate_length, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading candidate length from input file")
    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1:
        status = self._read(entry_index, sizeof(int64_t))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entry index for alias from input file")
        status = self._read(prob, sizeof(float))
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading prob for entity/alias from input file")
    cdef int _read(self, void* value, size_t size) except -1:
        status = fread(value, size, 1, self._fp)
        return status
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -3,16 +3,18 @@
 # coding: utf8
 from __future__ import unicode_literals
 cimport numpy as np
 import numpy
 import srsly
 import random
 from collections import OrderedDict
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical, copy_array
+from thinc.neural.util import to_categorical
 from thinc.neural.util import get_array_module
 from spacy.kb import KnowledgeBase
 from ..cli.pretrain import get_cossim_loss
 from .functions import merge_subtokens
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
@ -24,9 +26,9 @@ from ..vocab cimport Vocab
 from ..syntax import nonproj
 from ..attrs import POS, ID
 from ..parts_of_speech import X
-from .._ml import Tok2Vec, build_tagger_model
+from .._ml import Tok2Vec, build_tagger_model, cosine
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-from .._ml import build_bow_text_classifier
+from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
@ -229,7 +231,7 @@ class Tensorizer(Pipe):
        vocab (Vocab): A `Vocab` instance. The model must share the same
            `Vocab` instance with the `Doc` objects it will process.
-        model (Model): A `Model` instance or `True` allocate one later.
+        model (Model): A `Model` instance or `True` to allocate one later.
        **cfg: Config parameters.
        EXAMPLE:
@ -294,7 +296,7 @@ class Tensorizer(Pipe):
        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of `GoldParse` objects.
-        drop (float): The droput rate.
+        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        RETURNS (dict): Results from the update.
        """
@ -386,7 +388,7 @@ class Tagger(Pipe):
    def predict(self, docs):
        self.require_model()
        if not any(len(doc) for doc in docs):
-            # Handle case where there are no tokens in any docs.
+            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
@ -1063,52 +1065,252 @@ cdef class EntityRecognizer(Parser):
 class EntityLinker(Pipe):
    """Pipeline component for named entity linking.
    DOCS: TODO
    """
    name = 'entity_linker'
    @classmethod
-    def Model(cls, nr_class=1, **cfg):
+    def Model(cls, **cfg):
-        # TODO: non-dummy EL implementation
+        embed_width = cfg.get("embed_width", 300)
-        return None
+        hidden_width = cfg.get("hidden_width", 128)
        type_to_int = cfg.get("type_to_int", dict())
-    def __init__(self, model=True, **cfg):
+        model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg)
-        self.model = False
+        return model
    def __init__(self, vocab, **cfg):
        self.vocab = vocab
        self.model = True
        self.kb = None
        self.cfg = dict(cfg)
-        self.kb = self.cfg["kb"]
+        self.sgd_context = None
    def set_kb(self, kb):
        self.kb = kb
    def require_model(self):
        # Raise an error if the component's model is not initialized.
        if getattr(self, "model", None) in (None, True, False):
            raise ValueError(Errors.E109.format(name=self.name))
    def require_kb(self):
        # Raise an error if the knowledge base is not initialized.
        if getattr(self, "kb", None) in (None, True, False):
            raise ValueError(Errors.E139.format(name=self.name))
    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
        self.require_kb()
        self.cfg["entity_width"] = self.kb.entity_vector_length
        if self.model is True:
            self.model = self.Model(**self.cfg)
            self.sgd_context = self.create_optimizer()
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
        self.require_model()
        self.require_kb()
        if losses is not None:
            losses.setdefault(self.name, 0.0)
        if not docs or not golds:
            return 0
        if len(docs) != len(golds):
            raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs),
                                                n_golds=len(golds)))
        if isinstance(docs, Doc):
            docs = [docs]
            golds = [golds]
        context_docs = []
        entity_encodings = []
        cats = []
        priors = []
        type_vectors = []
        type_to_int = self.cfg.get("type_to_int", dict())
        for doc, gold in zip(docs, golds):
            ents_by_offset = dict()
            for ent in doc.ents:
                ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent
            for entity in gold.links:
                start, end, gold_kb = entity
                mention = doc.text[start:end]
                gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)]
                assert gold_ent is not None
                type_vector = [0 for i in range(len(type_to_int))]
                if len(type_to_int) > 0:
                    type_vector[type_to_int[gold_ent.label_]] = 1
                candidates = self.kb.get_candidates(mention)
                random.shuffle(candidates)
                nr_neg = 0
                for c in candidates:
                    kb_id = c.entity_
                    entity_encoding = c.entity_vector
                    entity_encodings.append(entity_encoding)
                    context_docs.append(doc)
                    type_vectors.append(type_vector)
                    if self.cfg.get("prior_weight", 1) > 0:
                        priors.append([c.prior_prob])
                    else:
                        priors.append([0])
                    if kb_id == gold_kb:
                        cats.append([1])
                    else:
                        nr_neg += 1
                        cats.append([0])
        if len(entity_encodings) > 0:
            assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors)
            context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop)
            entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
            mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i]
                                 for i in range(len(entity_encodings))]
            pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop)
            cats = self.model.ops.asarray(cats, dtype="float32")
            loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None)
            mention_gradient = bp_mention(d_scores, sgd=sgd)
            context_gradients = [list(x[0:self.cfg.get("context_width")]) for x in mention_gradient]
            bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context)
            if losses is not None:
                losses[self.name] += loss
            return loss
        return 0
    def get_loss(self, docs, golds, prediction):
        d_scores = (prediction - golds)
        loss = (d_scores ** 2).sum()
        loss = loss / len(golds)
        return loss, d_scores
    def get_loss_old(self, docs, golds, scores):
        # this loss function assumes we're only using positive examples
        loss, gradients = get_cossim_loss(yh=scores, y=golds)
        loss = loss / len(golds)
        return loss, gradients
    def __call__(self, doc):
-        self.set_annotations([doc], scores=None, tensors=None)
+        entities, kb_ids = self.predict([doc])
        self.set_annotations([doc], entities, kb_ids)
        return doc
    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.
        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
-            self.set_annotations(docs, scores=None, tensors=None)
+            entities, kb_ids = self.predict(docs)
            self.set_annotations(docs, entities, kb_ids)
            yield from docs
-    def set_annotations(self, docs, scores, tensors=None):
+    def predict(self, docs):
-        """
+        self.require_model()
-        Currently implemented as taking the KB entry with highest prior probability for each named entity
+        self.require_kb()
-        TODO: actually use context etc
+
-        """
+        final_entities = []
        final_kb_ids = []
        if not docs:
            return final_entities, final_kb_ids
        if isinstance(docs, Doc):
            docs = [docs]
        context_encodings = self.model.tok2vec(docs)
        xp = get_array_module(context_encodings)
        type_to_int = self.cfg.get("type_to_int", dict())
        for i, doc in enumerate(docs):
            if len(doc) > 0:
                context_encoding = context_encodings[i]
                for ent in doc.ents:
                    type_vector = [0 for i in range(len(type_to_int))]
                    if len(type_to_int) > 0:
                        type_vector[type_to_int[ent.label_]] = 1
                    candidates = self.kb.get_candidates(ent.text)
                    if candidates:
-                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
+                        random.shuffle(candidates)
                    for token in ent:
                        token.ent_kb_id_ = best_candidate.entity_
-    def get_loss(self, docs, golds, scores):
+                        # this will set the prior probabilities to 0 (just like in training) if their weight is 0
-        # TODO
+                        prior_probs = xp.asarray([[c.prior_prob] for c in candidates])
-        pass
+                        prior_probs *= self.cfg.get("prior_weight", 1)
                        scores = prior_probs
                        if self.cfg.get("context_weight", 1) > 0:
                            entity_encodings = xp.asarray([c.entity_vector for c in candidates])
                            assert len(entity_encodings) == len(prior_probs)
                            mention_encodings = [list(context_encoding) + list(entity_encodings[i])
                                                 + list(prior_probs[i]) + type_vector
                                                 for i in range(len(entity_encodings))]
                            scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
                        # TODO: thresholding
                        best_index = scores.argmax()
                        best_candidate = candidates[best_index]
                        final_entities.append(ent)
                        final_kb_ids.append(best_candidate.entity_)
        return final_entities, final_kb_ids
    def set_annotations(self, docs, entities, kb_ids=None):
        for entity, kb_id in zip(entities, kb_ids):
            for token in entity:
                token.ent_kb_id_ = kb_id
    def to_disk(self, path, exclude=tuple(), **kwargs):
        serialize = OrderedDict()
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["kb"] = lambda p: self.kb.dump(p)
        if self.model not in (None, True, False):
            serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
        util.to_disk(path, serialize, exclude)
    def from_disk(self, path, exclude=tuple(), **kwargs):
        def load_model(p):
             if self.model is True:
                self.model = self.Model(**self.cfg)
             self.model.from_bytes(p.open("rb").read())
        def load_kb(p):
            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
            kb.load_bulk(p)
            self.set_kb(kb)
        deserialize = OrderedDict()
        deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["kb"] = load_kb
        deserialize["model"] = load_model
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_disk(path, deserialize, exclude)
        return self
    def rehearse(self, docs, sgd=None, losses=None, **config):
        raise NotImplementedError
    def add_label(self, label):
-        # TODO
+        raise NotImplementedError
        pass
 class Sentencizer(object):
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
 from .typedefs cimport flags_t, attr_t, hash_t
 from .parts_of_speech cimport univ_pos_t
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 cdef struct LexemeC:
    flags_t flags
@ -72,3 +76,32 @@ cdef struct TokenC:
    attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
    attr_t ent_kb_id
    hash_t ent_id
 # Internal struct, for storage and disambiguation of entities.
 cdef struct KBEntryC:
    # The hash of this entry's unique ID/name in the kB
    hash_t entity_hash
    # Allows retrieval of the entity vector, as an index into a vectors table of the KB.
    # Can be expanded later to refer to multiple rows (compositional model to reduce storage footprint).
    int32_t vector_index
    # Allows retrieval of a struct of non-vector features.
    # This is currently not implemented and set to -1 for the common case where there are no features.
    int32_t feats_row
    # log probability of entity, based on corpus frequency
    float prob
 # Each alias struct stores a list of Entry pointers with their prior probabilities
 # for this specific mention/alias.
 cdef struct AliasC:
    # All entry candidates for this alias
    vector[int64_t] entry_indices
    # Prior probability P(entity|alias) - should sum up to (at most) 1.
    vector[float] probs
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -81,6 +81,7 @@ cdef enum symbol_t:
    DEP
    ENT_IOB
    ENT_TYPE
    ENT_KB_ID
    HEAD
    SENT_START
    SPACY
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -86,6 +86,7 @@ IDS = {
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD,
    "SENT_START": SENT_START,
    "SPACY": SPACY,
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@ -1,91 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 from spacy.kb import KnowledgeBase
 from spacy.lang.en import English
@pytest.fixture
 def nlp():
    return English()
 def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab)
    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2')
    mykb.add_entity(entity=u'Q3', prob=0.5)
    # adding aliases
    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])
    # test the size of the corresponding KB
    assert(mykb.get_size_entities() == 3)
    assert(mykb.get_size_aliases() == 2)
 def test_kb_invalid_entities(nlp):
    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
    mykb = KnowledgeBase(nlp.vocab)
    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2', prob=0.2)
    mykb.add_entity(entity=u'Q3', prob=0.5)
    # adding aliases - should fail because one of the given IDs is not valid
    with pytest.raises(ValueError):
        mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q342'], probabilities=[0.8, 0.2])
 def test_kb_invalid_probabilities(nlp):
    """Test the invalid construction of a KB with wrong prior probabilities"""
    mykb = KnowledgeBase(nlp.vocab)
    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2', prob=0.2)
    mykb.add_entity(entity=u'Q3', prob=0.5)
    # adding aliases - should fail because the sum of the probabilities exceeds 1
    with pytest.raises(ValueError):
        mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.4])
 def test_kb_invalid_combination(nlp):
    """Test the invalid construction of a KB with non-matching entity and probability lists"""
    mykb = KnowledgeBase(nlp.vocab)
    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2', prob=0.2)
    mykb.add_entity(entity=u'Q3', prob=0.5)
    # adding aliases - should fail because the entities and probabilities vectors are not of equal length
    with pytest.raises(ValueError):
        mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1])
 def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab)
    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2', prob=0.2)
    mykb.add_entity(entity=u'Q3', prob=0.5)
    # adding aliases
    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])
    # test the size of the relevant candidates
    assert(len(mykb.get_candidates(u'douglas')) == 2)
    assert(len(mykb.get_candidates(u'adam')) == 1)
    assert(len(mykb.get_candidates(u'shrubbery')) == 0)
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -0,0 +1,145 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 from spacy.kb import KnowledgeBase
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
@pytest.fixture
 def nlp():
    return English()
 def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.5, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])
    # adding aliases
    mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9])
    # test the size of the corresponding KB
    assert(mykb.get_size_entities() == 3)
    assert(mykb.get_size_aliases() == 2)
 def test_kb_invalid_entities(nlp):
    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])
    # adding aliases - should fail because one of the given IDs is not valid
    with pytest.raises(ValueError):
        mykb.add_alias(alias='douglas', entities=['Q2', 'Q342'], probabilities=[0.8, 0.2])
 def test_kb_invalid_probabilities(nlp):
    """Test the invalid construction of a KB with wrong prior probabilities"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])
    # adding aliases - should fail because the sum of the probabilities exceeds 1
    with pytest.raises(ValueError):
        mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.4])
 def test_kb_invalid_combination(nlp):
    """Test the invalid construction of a KB with non-matching entity and probability lists"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])
    # adding aliases - should fail because the entities and probabilities vectors are not of equal length
    with pytest.raises(ValueError):
        mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.3, 0.4, 0.1])
 def test_kb_invalid_entity_vector(nlp):
    """Test the invalid construction of a KB with non-matching entity vector lengths"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1, 2, 3])
    # this should fail because the kb's expected entity vector length is 3
    with pytest.raises(ValueError):
        mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2])
 def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])
    # adding aliases
    mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9])
    # test the size of the relevant candidates
    assert(len(mykb.get_candidates('douglas')) == 2)
    assert(len(mykb.get_candidates('adam')) == 1)
    assert(len(mykb.get_candidates('shrubbery')) == 0)
 def test_preserving_links_asdoc(nlp):
    """Test that Span.as_doc preserves the existing entity links"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.8, entity_vector=[1])
    # adding aliases
    mykb.add_alias(alias='Boston', entities=['Q1'], probabilities=[0.7])
    mykb.add_alias(alias='Denver', entities=['Q2'], probabilities=[0.6])
    # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    ruler = EntityRuler(nlp)
    patterns = [{"label": "GPE", "pattern": "Boston"},
                {"label": "GPE", "pattern": "Denver"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64})
    el_pipe.set_kb(mykb)
    el_pipe.begin_training()
    el_pipe.context_weight = 0
    el_pipe.prior_weight = 1
    nlp.add_pipe(el_pipe, last=True)
    # test whether the entity links are preserved by the `as_doc()` function
    text = "She lives in Boston. He lives in Denver."
    doc = nlp(text)
    for ent in doc.ents:
        orig_text = ent.text
        orig_kb_id = ent.kb_id_
        sent_doc = ent.sent.as_doc()
        for s_ent in sent_doc.ents:
            if s_ent.text == orig_text:
                assert s_ent.kb_id_ == orig_kb_id
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@ -0,0 +1,74 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from ..util import make_tempdir
 from ...util import ensure_path
 from spacy.kb import KnowledgeBase
 def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
    # final assertions
    _check_kb(kb2)
 def _get_dummy_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3])
    kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0])
    kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7])
    kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4])
    kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
    kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])
    kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0])
    return kb
 def _check_kb(kb):
    # check entities
    assert kb.get_size_entities() == 4
    for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']:
        assert entity_string in kb.get_entity_strings()
    for entity_string in ['', 'Q0']:
        assert entity_string not in kb.get_entity_strings()
    # check aliases
    assert kb.get_size_aliases() == 3
    for alias_string in ['double07', 'guy', 'random']:
        assert alias_string in kb.get_alias_strings()
    for alias_string in ['nothingness', '', 'randomnoise']:
        assert alias_string not in kb.get_alias_strings()
    # check candidates & probabilities
    candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_)
    assert len(candidates) == 2
    assert candidates[0].entity_ == 'Q007'
    assert 0.6999 < candidates[0].entity_freq < 0.701
    assert candidates[0].entity_vector == [0, 0, 7]
    assert candidates[0].alias_ == 'double07'
    assert 0.899 < candidates[0].prior_prob < 0.901
    assert candidates[1].entity_ == 'Q17'
    assert 0.199 < candidates[1].entity_freq < 0.201
    assert candidates[1].entity_vector == [7, 1, 0]
    assert candidates[1].alias_ == 'double07'
    assert 0.099 < candidates[1].prior_prob < 0.101
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -22,7 +22,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
-from ..attrs cimport ENT_TYPE, SENT_START, attr_id_t
+from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..attrs import intify_attrs, IDS
@ -64,6 +64,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
        return token.ent_iob
    elif feat_name == ENT_TYPE:
        return token.ent_type
    elif feat_name == ENT_KB_ID:
        return token.ent_kb_id
    else:
        return Lexeme.get_struct_attr(token.lex, feat_name)
@ -851,7 +853,7 @@ cdef class Doc:
        DOCS: https://spacy.io/api/doc#to_bytes
        """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
        if self.is_tagged:
            array_head.append(TAG)
        # If doc parsed add head and dep attribute
@ -1005,6 +1007,7 @@ cdef class Doc:
        """
        cdef unicode tag, lemma, ent_type
        deprecation_warning(Warnings.W013.format(obj="Doc"))
        # TODO: ENT_KB_ID ?
        if len(args) == 3:
            deprecation_warning(Warnings.W003)
            tag, lemma, ent_type = args
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -210,7 +210,7 @@ cdef class Span:
        words = [t.text for t in self]
        spaces = [bool(t.whitespace_) for t in self]
        cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID]
        if self.doc.is_tagged:
            array_head.append(TAG)
        # If doc parsed add head and dep attribute
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@ -53,6 +53,8 @@ cdef class Token:
            return token.ent_iob
        elif feat_name == ENT_TYPE:
            return token.ent_type
        elif feat_name == ENT_KB_ID:
            return token.ent_kb_id
        elif feat_name == SENT_START:
            return token.sent_start
        else:
@ -79,5 +81,7 @@ cdef class Token:
            token.ent_iob = value
        elif feat_name == ENT_TYPE:
            token.ent_type = value
        elif feat_name == ENT_KB_ID:
            token.ent_kb_id = value
        elif feat_name == SENT_START:
            token.sent_start = value