From 60269589576222330fcb6f7f30fdff2ce3b71a95 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Jul 2019 11:19:34 +0200 Subject: [PATCH 01/28] tokenizer doc fix --- spacy/tokenizer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 70a693ba1..f19f851c7 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -348,7 +348,7 @@ cdef class Tokenizer: """Add a special-case tokenization rule. string (unicode): The string to specially tokenize. - token_attrs (iterable): A sequence of dicts, where each dict describes + substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. From 6e809e9b8b3c961bc4c7c4565edef699717c0919 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Jul 2019 11:42:50 +0200 Subject: [PATCH 02/28] proper error for missing cfg arguments --- spacy/_ml.py | 5 ++--- spacy/errors.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4d9bb4c2b..d71b8d56c 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -663,11 +663,10 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): - # TODO proper error if "entity_width" not in cfg: - raise ValueError("entity_width not found") + raise ValueError(Errors.E144.format(param="entity_width")) if "context_width" not in cfg: - raise ValueError("context_width not found") + raise ValueError(Errors.E144.format(param="context_width")) conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) diff --git a/spacy/errors.py b/spacy/errors.py index ed3d6afb9..cb8bb44b4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -406,6 +406,7 @@ class Errors(object): E141 = ("Entity vectors should be of length {required} instead of the provided {found}.") E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'") E143 = ("Labels for component '{name}' not initialized. 
Did you forget to call add_label()?") + E144 = ("Could not find parameter `{param}` when building the entity linker model.") @add_codes From 60f299374f61d4ab394f035bed8f39493669214d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Jul 2019 12:03:09 +0200 Subject: [PATCH 03/28] set default context width --- spacy/pipeline/pipes.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 891e8d4e3..da3602b79 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1093,6 +1093,8 @@ class EntityLinker(Pipe): self.kb = None self.cfg = dict(cfg) self.sgd_context = None + if not self.cfg.get("context_width"): + self.cfg["context_width"] = 128 def set_kb(self, kb): self.kb = kb From cdc589d3447a073fac9c50b48531f9ebf2d771a6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Jul 2019 12:04:45 +0200 Subject: [PATCH 04/28] small fix --- spacy/_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d71b8d56c..dedd1bee5 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -670,7 +670,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name + pretrained_vectors = cfg.get("pretrained_vectors", None) context_width = cfg.get("context_width") entity_width = cfg.get("entity_width") From a63d15a14274737e0ea99bd1517fcfdacbff36dd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Jul 2019 17:36:43 +0200 Subject: [PATCH 05/28] code cleanup --- spacy/pipeline/pipes.pyx | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index da3602b79..6b948e585 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,7 +14,6 @@ from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module from spacy.kb import KnowledgeBase -from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -1164,7 +1163,6 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(mention) random.shuffle(candidates) - nr_neg = 0 for c in candidates: kb_id = c.entity_ entity_encoding = c.entity_vector @@ -1180,21 +1178,20 @@ class EntityLinker(Pipe): if kb_id == gold_kb: cats.append([1]) else: - nr_neg += 1 cats.append([0]) if len(entity_encodings) > 0: assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) - context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) + cats = self.model.ops.asarray(cats, dtype="float32") entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") + context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i] for i in range(len(entity_encodings))] pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) - cats = self.model.ops.asarray(cats, dtype="float32") - loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None) + loss, d_scores = self.get_loss(scores=pred, golds=cats, docs=docs) mention_gradient = bp_mention(d_scores, sgd=sgd) context_gradients = 
[list(x[0:self.cfg.get("context_width")]) for x in mention_gradient] @@ -1205,18 +1202,12 @@ class EntityLinker(Pipe): return loss return 0 - def get_loss(self, docs, golds, prediction): - d_scores = (prediction - golds) + def get_loss(self, docs, golds, scores): + d_scores = (scores - golds) loss = (d_scores ** 2).sum() loss = loss / len(golds) return loss, d_scores - def get_loss_old(self, docs, golds, scores): - # this loss function assumes we're only using positive examples - loss, gradients = get_cossim_loss(yh=scores, y=golds) - loss = loss / len(golds) - return loss, gradients - def __call__(self, doc): entities, kb_ids = self.predict([doc]) self.set_annotations([doc], entities, kb_ids) From 4086c6ff6041675b6cb01bc5bd02c7206a972c26 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 17 Jul 2019 12:17:02 +0200 Subject: [PATCH 06/28] get vector functionality + unit test --- .../training_set_creator.py | 211 ++++++++++++------ spacy/kb.pyx | 9 + spacy/tests/pipeline/test_entity_linker.py | 85 ++++--- 3 files changed, 201 insertions(+), 104 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 5d401bb3f..fb92373a3 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import os +import random import re import bz2 import datetime @@ -27,21 +28,23 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N Read the XML wikipedia data to parse out training data: raw text data + positive instances """ - title_regex = re.compile(r'(?<=).*(?=)') - id_regex = re.compile(r'(?<=)\d*(?=)') + title_regex = re.compile(r"(?<=).*(?=)") + id_regex = re.compile(r"(?<=)\d*(?=)") read_ids = set() entityfile_loc = training_output / ENTITY_FILE - with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: + with open(entityfile_loc, mode="w", encoding="utf8") as entityfile: # write entity training header file - _write_training_entity(outputfile=entityfile, - article_id="article_id", - alias="alias", - entity="WD_id", - start="start", - end="end") + _write_training_entity( + outputfile=entityfile, + article_id="article_id", + alias="alias", + entity="WD_id", + start="start", + end="end", + ) - with bz2.open(wikipedia_input, mode='rb') as file: + with bz2.open(wikipedia_input, mode="rb") as file: line = file.readline() cnt = 0 article_text = "" @@ -51,7 +54,12 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N reading_revision = False while line and (not limit or cnt < limit): if cnt % 1000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + print( + datetime.datetime.now(), + "processed", + cnt, + "lines of Wikipedia dump", + ) clean_line = line.strip().decode("utf-8") if clean_line == "": @@ -69,12 +77,23 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N elif clean_line == "": if article_id: try: - _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), - training_output) + _process_wp_text( + wp_to_id, + entityfile, + article_id, + article_title, + article_text.strip(), + training_output, + ) except Exception as e: - print("Error processing article", article_id, article_title, e) + print( + "Error processing article", article_id, article_title, e + ) else: - print("Done processing a page, but couldn't find an article_id ?", article_title) + print( + 
"Done processing a page, but couldn't find an article_id ?", + article_title, + ) article_text = "" article_title = None article_id = None @@ -98,7 +117,9 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N if ids: article_id = ids[0] if article_id in read_ids: - print("Found duplicate article ID", article_id, clean_line) # This should never happen ... + print( + "Found duplicate article ID", article_id, clean_line + ) # This should never happen ... read_ids.add(article_id) # read the title of this article (outside the revision portion of the document) @@ -111,10 +132,12 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N cnt += 1 -text_regex = re.compile(r'(?<=).*(?=).*(?= 2: reading_special_case = True @@ -175,7 +198,7 @@ def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_te # we just finished reading an entity if open_read == 0 and not reading_text: - if '#' in entity_buffer or entity_buffer.startswith(':'): + if "#" in entity_buffer or entity_buffer.startswith(":"): reading_special_case = True # Ignore cases with nested structures like File: handles etc if not reading_special_case: @@ -185,12 +208,14 @@ def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_te end = start + len(mention_buffer) qid = wp_to_id.get(entity_buffer, None) if qid: - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=mention_buffer, - entity=qid, - start=start, - end=end) + _write_training_entity( + outputfile=entityfile, + article_id=article_id, + alias=mention_buffer, + entity=qid, + start=start, + end=end, + ) found_entities = True final_text += mention_buffer @@ -203,29 +228,35 @@ def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_te reading_special_case = False if found_entities: - _write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output) + _write_training_article( + article_id=article_id, + clean_text=final_text, + training_output=training_output, + ) -info_regex = re.compile(r'{[^{]*?}') -htlm_regex = re.compile(r'<!--[^-]*-->') -category_regex = re.compile(r'\[\[Category:[^\[]*]]') -file_regex = re.compile(r'\[\[File:[^[\]]+]]') -ref_regex = re.compile(r'<ref.*?>') # non-greedy -ref_2_regex = re.compile(r'</ref.*?>') # non-greedy +info_regex = re.compile(r"{[^{]*?}") +htlm_regex = re.compile(r"<!--[^-]*-->") +category_regex = re.compile(r"\[\[Category:[^\[]*]]") +file_regex = re.compile(r"\[\[File:[^[\]]+]]") +ref_regex = re.compile(r"<ref.*?>") # non-greedy +ref_2_regex = re.compile(r"</ref.*?>") # non-greedy def _get_clean_wp_text(article_text): clean_text = article_text.strip() # remove bolding & italic markup - clean_text = clean_text.replace('\'\'\'', '') - clean_text = clean_text.replace('\'\'', '') + clean_text = clean_text.replace("'''", "") + clean_text = clean_text.replace("''", "") # remove nested {{info}} statements by removing the inner/smallest ones first and iterating try_again = True previous_length = len(clean_text) while try_again: - clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { + clean_text = info_regex.sub( + "", clean_text + ) # non-greedy match excluding a nested { if len(clean_text) < previous_length: try_again = True else: @@ -233,14 +264,14 @@ def _get_clean_wp_text(article_text): previous_length = len(clean_text) # remove HTML comments - clean_text = htlm_regex.sub('', clean_text) + clean_text = htlm_regex.sub("", clean_text) # 
remove Category and File statements - clean_text = category_regex.sub('', clean_text) - clean_text = file_regex.sub('', clean_text) + clean_text = category_regex.sub("", clean_text) + clean_text = file_regex.sub("", clean_text) # remove multiple = - while '==' in clean_text: + while "==" in clean_text: clean_text = clean_text.replace("==", "=") clean_text = clean_text.replace(". =", ".") @@ -249,43 +280,56 @@ def _get_clean_wp_text(article_text): clean_text = clean_text.replace(" =", "") # remove refs (non-greedy match) - clean_text = ref_regex.sub('', clean_text) - clean_text = ref_2_regex.sub('', clean_text) + clean_text = ref_regex.sub("", clean_text) + clean_text = ref_2_regex.sub("", clean_text) # remove additional wikiformatting - clean_text = re.sub(r'<blockquote>', '', clean_text) - clean_text = re.sub(r'</blockquote>', '', clean_text) + clean_text = re.sub(r"<blockquote>", "", clean_text) + clean_text = re.sub(r"</blockquote>", "", clean_text) # change special characters back to normal ones - clean_text = clean_text.replace(r'<', '<') - clean_text = clean_text.replace(r'>', '>') - clean_text = clean_text.replace(r'"', '"') - clean_text = clean_text.replace(r'&nbsp;', ' ') - clean_text = clean_text.replace(r'&', '&') + clean_text = clean_text.replace(r"<", "<") + clean_text = clean_text.replace(r">", ">") + clean_text = clean_text.replace(r""", '"') + clean_text = clean_text.replace(r"&nbsp;", " ") + clean_text = clean_text.replace(r"&", "&") # remove multiple spaces - while ' ' in clean_text: - clean_text = clean_text.replace(' ', ' ') + while " " in clean_text: + clean_text = clean_text.replace(" ", " ") return clean_text.strip() def _write_training_article(article_id, clean_text, training_output): file_loc = training_output / str(article_id) + ".txt" - with open(file_loc, mode='w', encoding='utf8') as outputfile: + with open(file_loc, mode="w", encoding="utf8") as outputfile: outputfile.write(clean_text) def _write_training_entity(outputfile, article_id, alias, entity, start, end): - outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n") + outputfile.write( + article_id + + "|" + + alias + + "|" + + entity + + "|" + + str(start) + + "|" + + str(end) + + "\n" + ) def is_dev(article_id): return article_id.endswith("3") -def read_training(nlp, training_dir, dev, limit): - # This method provides training examples that correspond to the entity annotations found by the nlp object +def read_training(nlp, training_dir, dev, limit, kb=None): + """ This method provides training examples that correspond to the entity annotations found by the nlp object. + When kb is provided, it will include also negative training examples by using the candidate generator. 
+ When kb=None, it will only include positive training examples.""" entityfile_loc = training_dir / ENTITY_FILE data = [] @@ -296,24 +340,34 @@ def read_training(nlp, training_dir, dev, limit): skip_articles = set() total_entities = 0 - with open(entityfile_loc, mode='r', encoding='utf8') as file: + with open(entityfile_loc, mode="r", encoding="utf8") as file: for line in file: if not limit or len(data) < limit: - fields = line.replace('\n', "").split(sep='|') + fields = line.replace("\n", "").split(sep="|") article_id = fields[0] alias = fields[1] - wp_title = fields[2] + wd_id = fields[2] start = fields[3] end = fields[4] - if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles: + if ( + dev == is_dev(article_id) + and article_id != "article_id" + and article_id not in skip_articles + ): if not current_doc or (current_article_id != article_id): # parse the new article text file_name = article_id + ".txt" try: - with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f: + with open( + os.path.join(training_dir, file_name), + mode="r", + encoding="utf8", + ) as f: text = f.read() - if len(text) < 30000: # threshold for convenience / speed of processing + if ( + len(text) < 30000 + ): # threshold for convenience / speed of processing current_doc = nlp(text) current_article_id = article_id ents_by_offset = dict() @@ -321,7 +375,11 @@ def read_training(nlp, training_dir, dev, limit): sent_length = len(ent.sent) # custom filtering to avoid too long or too short sentences if 5 < sent_length < 100: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + ents_by_offset[ + str(ent.start_char) + + "_" + + str(ent.end_char) + ] = ent else: skip_articles.add(article_id) current_doc = None @@ -332,7 +390,7 @@ def read_training(nlp, training_dir, dev, limit): # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): - found_ent = ents_by_offset.get(start + "_" + end, None) + found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: if found_ent.text != alias: skip_articles.add(article_id) @@ -342,7 +400,26 @@ def read_training(nlp, training_dir, dev, limit): # currently feeding the gold data one entity per sentence at a time gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - gold_entities = [(gold_start, gold_end, wp_title)] + + # add both positive and negative examples (in random order just to be sure) + if kb: + gold_entities = {} + candidate_ids = [ + c.entity_ for c in kb.get_candidates(alias) + ] + candidate_ids.append( + wd_id + ) # in case the KB doesn't have it + random.shuffle(candidate_ids) + for kb_id in candidate_ids: + entry = (gold_start, gold_end, kb_id) + if kb_id != wd_id: + gold_entities[entry] = 0.0 + else: + gold_entities[entry] = 1.0 + else: + gold_entities = {} + gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) total_entities += 1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 7c2daa659..315fa1945 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -203,6 +203,15 @@ cdef class KnowledgeBase: for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] + def get_vector(self, unicode entity): + cdef hash_t entity_hash = self.vocab.strings.add(entity) + + # Return an empty list if this entity is unknown in this KB + if entity_hash not in self._entry_index: + return [] + entry_index = self._entry_index[entity_hash] + + return 
self._vectors_table[self._entries[entry_index].vector_index] def dump(self, loc): cdef Writer writer = Writer(loc) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index cafc380ba..9d4ecb561 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -15,20 +15,25 @@ def nlp(): def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" - mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity='Q2', prob=0.5, entity_vector=[2]) - mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3]) + mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0]) + mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5]) # adding aliases - mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2]) - mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9]) + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the corresponding KB - assert(mykb.get_size_entities() == 3) - assert(mykb.get_size_aliases() == 2) + assert mykb.get_size_entities() == 3 + assert mykb.get_size_aliases() == 2 + + # test retrieval of the entity vectors + assert mykb.get_vector("Q1") == [8, 4, 3] + assert mykb.get_vector("Q2") == [2, 1, 0] + assert mykb.get_vector("Q3") == [-1, -6, 5] def test_kb_invalid_entities(nlp): @@ -36,13 +41,15 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): - mykb.add_alias(alias='douglas', entities=['Q2', 'Q342'], probabilities=[0.8, 0.2]) + mykb.add_alias( + alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2] + ) def test_kb_invalid_probabilities(nlp): @@ -50,13 +57,13 @@ def test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): - mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.4]) + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.4]) def test_kb_invalid_combination(nlp): @@ -64,13 +71,15 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) - 
mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): - mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.3, 0.4, 0.1]) + mykb.add_alias( + alias="douglas", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1] + ) def test_kb_invalid_entity_vector(nlp): @@ -78,11 +87,11 @@ def test_kb_invalid_entity_vector(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1, 2, 3]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3]) # this should fail because the kb's expected entity vector length is 3 with pytest.raises(ValueError): - mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) def test_candidate_generation(nlp): @@ -90,18 +99,18 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) # adding aliases - mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2]) - mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9]) + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert(len(mykb.get_candidates('douglas')) == 2) - assert(len(mykb.get_candidates('adam')) == 1) - assert(len(mykb.get_candidates('shrubbery')) == 0) + assert len(mykb.get_candidates("douglas")) == 2 + assert len(mykb.get_candidates("adam")) == 1 + assert len(mykb.get_candidates("shrubbery")) == 0 def test_preserving_links_asdoc(nlp): @@ -109,24 +118,26 @@ def test_preserving_links_asdoc(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity='Q2', prob=0.8, entity_vector=[1]) + mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1]) # adding aliases - mykb.add_alias(alias='Boston', entities=['Q1'], probabilities=[0.7]) - mykb.add_alias(alias='Denver', entities=['Q2'], probabilities=[0.6]) + mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) + mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6]) # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained) sentencizer = nlp.create_pipe("sentencizer") nlp.add_pipe(sentencizer) ruler = EntityRuler(nlp) - patterns = [{"label": "GPE", "pattern": "Boston"}, - {"label": "GPE", "pattern": "Denver"}] + patterns = [ + {"label": "GPE", "pattern": "Boston"}, + {"label": "GPE", "pattern": "Denver"}, + ] ruler.add_patterns(patterns) nlp.add_pipe(ruler) - el_pipe = nlp.create_pipe(name='entity_linker', 
config={"context_width": 64}) + el_pipe = nlp.create_pipe(name="entity_linker", config={"context_width": 64}) el_pipe.set_kb(mykb) el_pipe.begin_training() el_pipe.context_weight = 0 From d833d4c35898ce80eb24b33370a92c15414830b7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 17 Jul 2019 17:18:26 +0200 Subject: [PATCH 07/28] fixes in kb and gold --- .../training_set_creator.py | 15 +++++----- spacy/gold.pxd | 2 +- spacy/gold.pyx | 6 ++-- spacy/kb.pyx | 29 ++++++++++++++++--- spacy/tests/pipeline/test_entity_linker.py | 19 ++++++++++-- 5 files changed, 54 insertions(+), 17 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index fb92373a3..6261310ac 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -401,15 +401,13 @@ def read_training(nlp, training_dir, dev, limit, kb=None): gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - # add both positive and negative examples (in random order just to be sure) + # add both pos and neg examples (in random order) if kb: gold_entities = {} - candidate_ids = [ - c.entity_ for c in kb.get_candidates(alias) - ] - candidate_ids.append( - wd_id - ) # in case the KB doesn't have it + candidates = kb.get_candidates(alias) + candidate_ids = [c.entity_ for c in candidates] + # add positive example in case the KB doesn't have it + candidate_ids.append(wd_id) random.shuffle(candidate_ids) for kb_id in candidate_ids: entry = (gold_start, gold_end, kb_id) @@ -418,7 +416,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None): else: gold_entities[entry] = 1.0 else: - gold_entities = {} + entry = (gold_start, gold_end, wd_id) + gold_entities = {entry: 1.0} gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) diff --git a/spacy/gold.pxd b/spacy/gold.pxd index 8943a155a..a3123f7fa 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -31,7 +31,7 @@ cdef class GoldParse: cdef public list ents cdef public dict brackets cdef public object cats - cdef public list links + cdef public dict links cdef readonly list cand_to_gold cdef readonly list gold_to_cand diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 4fb22f3f0..81feb55a4 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -450,8 +450,10 @@ cdef class GoldParse: examples of a label to have the value 0.0. Labels not in the dictionary are treated as missing - the gradient for those labels will be zero. - links (iterable): A sequence of `(start_char, end_char, kb_id)` tuples, - representing the external ID of an entity in a knowledge base. + links (dict): A dict with `(start_char, end_char, kb_id)` keys, + representing the external ID of an entity in a knowledge base, + and the values being either 1.0 or 0.0, indicating positive and + negative examples, respectively. RETURNS (GoldParse): The newly constructed object. """ if words is None: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 315fa1945..214648c7f 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -191,7 +191,7 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): cdef hash_t alias_hash = self.vocab.strings[alias] - alias_index = self._alias_index.get(alias_hash) + alias_index = self._alias_index.get(alias_hash) # TODO: check for error? unit test ! 
alias_entry = self._aliases_table[alias_index] return [Candidate(kb=self, @@ -199,12 +199,12 @@ cdef class KnowledgeBase: entity_freq=self._entries[entry_index].prob, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], alias_hash=alias_hash, - prior_prob=prob) - for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) + prior_prob=prior_prob) + for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] def get_vector(self, unicode entity): - cdef hash_t entity_hash = self.vocab.strings.add(entity) + cdef hash_t entity_hash = self.vocab.strings[entity] # Return an empty list if this entity is unknown in this KB if entity_hash not in self._entry_index: @@ -213,6 +213,27 @@ cdef class KnowledgeBase: return self._vectors_table[self._entries[entry_index].vector_index] + def get_prior_prob(self, unicode entity, unicode alias): + """ Return the prior probability of a given alias being linked to a given entity, + or return 0.0 when this combination is not known in the knowledge base""" + cdef hash_t alias_hash = self.vocab.strings[alias] + cdef hash_t entity_hash = self.vocab.strings[entity] + + # TODO: error ? + if entity_hash not in self._entry_index or alias_hash not in self._alias_index: + return 0.0 + + alias_index = self._alias_index.get(alias_hash) + entry_index = self._entry_index[entity_hash] + + alias_entry = self._aliases_table[alias_index] + for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs): + if self._entries[entry_index].entity_hash == entity_hash: + return prior_prob + + return 0.0 + + def dump(self, loc): cdef Writer writer = Writer(loc) writer.write_header(self.get_size_entities(), self.entity_vector_length) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 9d4ecb561..c3163200a 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -13,6 +13,11 @@ def nlp(): return English() +def assert_almost_equal(a, b): + delta = 0.0001 + assert a - delta <= b <= a + delta + + def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) @@ -35,6 +40,10 @@ def test_kb_valid_entities(nlp): assert mykb.get_vector("Q2") == [2, 1, 0] assert mykb.get_vector("Q3") == [-1, -6, 5] + # test retrieval of prior probabilities + assert_almost_equal(mykb.get_prior_prob(entity="Q2", alias="douglas"), 0.8) + assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglas"), 0.2) + def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" @@ -99,12 +108,12 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1]) mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) # adding aliases - mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) + mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates @@ -112,6 +121,12 @@ def test_candidate_generation(nlp): assert len(mykb.get_candidates("adam")) == 1 assert 
len(mykb.get_candidates("shrubbery")) == 0 + # test the content of the candidates + assert mykb.get_candidates("adam")[0].entity_ == "Q2" + assert mykb.get_candidates("adam")[0].alias_ == "adam" + assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 0.2) + assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9) + def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" From ec55d2fccdf39ce78982b9cceb2cd339eccb6447 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jul 2019 10:22:24 +0200 Subject: [PATCH 08/28] filter training data beforehand (+black formatting) --- .../training_set_creator.py | 20 +- .../wikipedia_processor.py | 129 +++++++-- examples/pipeline/wikidata_entity_linking.py | 272 +++++++++++------- spacy/kb.pyx | 2 +- spacy/pipeline/pipes.pyx | 31 +- 5 files changed, 294 insertions(+), 160 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 6261310ac..eb961b9da 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -18,6 +18,10 @@ Gold-standard entities are stored in one file in standoff format (by character o ENTITY_FILE = "gold_entities.csv" +def now(): + return datetime.datetime.now() + + def create_training(wikipedia_input, entity_def_input, training_output): wp_to_id = kb_creator.get_entity_to_id(entity_def_input) _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None) @@ -54,12 +58,7 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N reading_revision = False while line and (not limit or cnt < limit): if cnt % 1000000 == 0: - print( - datetime.datetime.now(), - "processed", - cnt, - "lines of Wikipedia dump", - ) + print(now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") if clean_line == "": @@ -328,8 +327,9 @@ def is_dev(article_id): def read_training(nlp, training_dir, dev, limit, kb=None): """ This method provides training examples that correspond to the entity annotations found by the nlp object. - When kb is provided, it will include also negative training examples by using the candidate generator. - When kb=None, it will only include positive training examples.""" + When kb is provided (for training), it will include negative training examples by using the candidate generator, + and it will only keep positive training examples that can be found in the KB. 
+ When kb=None (for testing), it will include all positive examples only.""" entityfile_loc = training_dir / ENTITY_FILE data = [] @@ -402,12 +402,11 @@ def read_training(nlp, training_dir, dev, limit, kb=None): gold_end = int(end) - found_ent.sent.start_char # add both pos and neg examples (in random order) + # this will exclude examples not in the KB if kb: gold_entities = {} candidates = kb.get_candidates(alias) candidate_ids = [c.entity_ for c in candidates] - # add positive example in case the KB doesn't have it - candidate_ids.append(wd_id) random.shuffle(candidate_ids) for kb_id in candidate_ids: entry = (gold_start, gold_end, kb_id) @@ -415,6 +414,7 @@ def read_training(nlp, training_dir, dev, limit, kb=None): gold_entities[entry] = 0.0 else: gold_entities[entry] = 1.0 + # keep all positive examples else: entry = (gold_start, gold_end, wd_id) gold_entities = {entry: 1.0} diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index c02e472bc..4d11aee61 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -14,22 +14,97 @@ Write these results to file for downstream KB and training data generation. map_alias_to_link = dict() # these will/should be matched ignoring case -wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", - "d", "dbdump", "download", "Draft", "Education", "Foundation", - "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", - "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", - "MediaZilla", "Meta", "Metawikipedia", "Module", - "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", - "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", - "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", - "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", - "tswiki", "User", "User talk", "v", "voy", - "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", - "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", - "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] +wiki_namespaces = [ + "b", + "betawikiversity", + "Book", + "c", + "Category", + "Commons", + "d", + "dbdump", + "download", + "Draft", + "Education", + "Foundation", + "Gadget", + "Gadget definition", + "gerrit", + "File", + "Help", + "Image", + "Incubator", + "m", + "mail", + "mailarchive", + "media", + "MediaWiki", + "MediaWiki talk", + "Mediawikiwiki", + "MediaZilla", + "Meta", + "Metawikipedia", + "Module", + "mw", + "n", + "nost", + "oldwikisource", + "outreach", + "outreachwiki", + "otrs", + "OTRSwiki", + "Portal", + "phab", + "Phabricator", + "Project", + "q", + "quality", + "rev", + "s", + "spcom", + "Special", + "species", + "Strategy", + "sulutil", + "svn", + "Talk", + "Template", + "Template talk", + "Testwiki", + "ticket", + "TimedText", + "Toollabs", + "tools", + "tswiki", + "User", + "User talk", + "v", + "voy", + "w", + "Wikibooks", + "Wikidata", + "wikiHow", + "Wikinvest", + "wikilivres", + "Wikimedia", + "Wikinews", + "Wikipedia", + "Wikipedia talk", + "Wikiquote", + "Wikisource", + "Wikispecies", + "Wikitech", + "Wikiversity", + "Wikivoyage", + "wikt", + "wiktionary", + "wmf", + "wmania", + "WP", +] # find the links -link_regex = re.compile(r'\[\[[^\[\]]*\]\]') +link_regex = re.compile(r"\[\[[^\[\]]*\]\]") # match on 
interwiki links, e.g. `en:` or `:fr:` ns_regex = r":?" + "[a-z][a-z]" + ":" @@ -41,18 +116,22 @@ for ns in wiki_namespaces: ns_regex = re.compile(ns_regex, re.IGNORECASE) -def read_wikipedia_prior_probs(wikipedia_input, prior_prob_output): +def now(): + return datetime.datetime.now() + + +def read_prior_probs(wikipedia_input, prior_prob_output): """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities. The full file takes about 2h to parse 1100M lines. It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from. """ - with bz2.open(wikipedia_input, mode='rb') as file: + with bz2.open(wikipedia_input, mode="rb") as file: line = file.readline() cnt = 0 while line: if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + print(now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") aliases, entities, normalizations = get_wp_links(clean_line) @@ -64,10 +143,11 @@ def read_wikipedia_prior_probs(wikipedia_input, prior_prob_output): cnt += 1 # write all aliases and their entities and count occurrences to file - with open(prior_prob_output, mode='w', encoding='utf8') as outputfile: + with open(prior_prob_output, mode="w", encoding="utf8") as outputfile: outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True) + for entity, count in s_dict: outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") @@ -140,13 +220,13 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): entity_to_count = dict() total_count = 0 - with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: + with open(prior_prob_input, mode="r", encoding="utf8") as prior_file: # skip header prior_file.readline() line = prior_file.readline() while line: - splits = line.replace('\n', "").split(sep='|') + splits = line.replace("\n", "").split(sep="|") # alias = splits[0] count = int(splits[1]) entity = splits[2] @@ -158,7 +238,7 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): line = prior_file.readline() - with open(count_output, mode='w', encoding='utf8') as entity_file: + with open(count_output, mode="w", encoding="utf8") as entity_file: entity_file.write("entity" + "|" + "count" + "\n") for entity, count in entity_to_count.items(): entity_file.write(entity + "|" + str(count) + "\n") @@ -171,12 +251,11 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): def get_all_frequencies(count_input): entity_to_count = dict() - with open(count_input, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') + with open(count_input, "r", encoding="utf8") as csvfile: + csvreader = csv.reader(csvfile, delimiter="|") # skip header next(csvreader) for row in csvreader: entity_to_count[row[0]] = int(row[1]) return entity_to_count - diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 17c2976dd..341dc94ed 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -5,7 +5,8 @@ import random import datetime from pathlib import Path -from bin.wiki_entity_linking import training_set_creator, kb_creator, wikipedia_processor as wp 
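# A minimal illustrative sketch (function name and file path are hypothetical) of reading
# the pipe-separated "alias|count|entity" table that read_prior_probs() above writes, and
# normalising the raw counts into per-alias prior probabilities of the kind the KB later
# exposes via get_prior_prob():
from collections import defaultdict


def load_prior_probs(path="prior_prob.csv"):
    counts = defaultdict(dict)
    with open(path, encoding="utf8") as f:
        next(f)  # skip the "alias|count|entity" header line
        for line in f:
            alias, count, entity = line.rstrip("\n").split("|")
            counts[alias][entity] = int(count)
    priors = {}
    for alias, by_entity in counts.items():
        total = sum(by_entity.values())
        priors[alias] = {entity: c / total for entity, c in by_entity.items()}
    return priors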
+from bin.wiki_entity_linking import wikipedia_processor as wp +from bin.wiki_entity_linking import training_set_creator, kb_creator from bin.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy @@ -17,23 +18,25 @@ Demonstrate how to build a knowledge base from WikiData and run an Entity Linkin """ ROOT_DIR = Path("C:/Users/Sofie/Documents/data/") -OUTPUT_DIR = ROOT_DIR / 'wikipedia' -TRAINING_DIR = OUTPUT_DIR / 'training_data_nel' +OUTPUT_DIR = ROOT_DIR / "wikipedia" +TRAINING_DIR = OUTPUT_DIR / "training_data_nel" -PRIOR_PROB = OUTPUT_DIR / 'prior_prob.csv' -ENTITY_COUNTS = OUTPUT_DIR / 'entity_freq.csv' -ENTITY_DEFS = OUTPUT_DIR / 'entity_defs.csv' -ENTITY_DESCR = OUTPUT_DIR / 'entity_descriptions.csv' +PRIOR_PROB = OUTPUT_DIR / "prior_prob.csv" +ENTITY_COUNTS = OUTPUT_DIR / "entity_freq.csv" +ENTITY_DEFS = OUTPUT_DIR / "entity_defs.csv" +ENTITY_DESCR = OUTPUT_DIR / "entity_descriptions.csv" -KB_FILE = OUTPUT_DIR / 'kb_1' / 'kb' -NLP_1_DIR = OUTPUT_DIR / 'nlp_1' -NLP_2_DIR = OUTPUT_DIR / 'nlp_2' +KB_FILE = OUTPUT_DIR / "kb_1" / "kb" +NLP_1_DIR = OUTPUT_DIR / "nlp_1" +NLP_2_DIR = OUTPUT_DIR / "nlp_2" # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ -WIKIDATA_JSON = ROOT_DIR / 'wikidata' / 'wikidata-20190304-all.json.bz2' +WIKIDATA_JSON = ROOT_DIR / "wikidata" / "wikidata-20190304-all.json.bz2" # get enwiki-latest-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/enwiki/latest/ -ENWIKI_DUMP = ROOT_DIR / 'wikipedia' / 'enwiki-20190320-pages-articles-multistream.xml.bz2' +ENWIKI_DUMP = ( + ROOT_DIR / "wikipedia" / "enwiki-20190320-pages-articles-multistream.xml.bz2" +) # KB construction parameters MAX_CANDIDATES = 10 @@ -48,11 +51,15 @@ L2 = 1e-6 CONTEXT_WIDTH = 128 +def now(): + return datetime.datetime.now() + + def run_pipeline(): # set the appropriate booleans to define which parts of the pipeline should be re(run) - print("START", datetime.datetime.now()) + print("START", now()) print() - nlp_1 = spacy.load('en_core_web_lg') + nlp_1 = spacy.load("en_core_web_lg") nlp_2 = None kb_2 = None @@ -82,40 +89,42 @@ def run_pipeline(): # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: - print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB) + print("STEP 1: to_create_prior_probs", now()) + wp.read_prior_probs(ENWIKI_DUMP, PRIOR_PROB) print() # STEP 2 : deduce entity frequencies from WP (run only once) if to_create_entity_counts: - print("STEP 2: to_create_entity_counts", datetime.datetime.now()) - wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False) + print("STEP 2: to_create_entity_counts", now()) + wp.write_entity_counts(PRIOR_PROB, ENTITY_COUNTS, to_print=False) print() # STEP 3 : create KB and write to file (run only once) if to_create_kb: - print("STEP 3a: to_create_kb", datetime.datetime.now()) - kb_1 = kb_creator.create_kb(nlp_1, - max_entities_per_alias=MAX_CANDIDATES, - min_entity_freq=MIN_ENTITY_FREQ, - min_occ=MIN_PAIR_OCC, - entity_def_output=ENTITY_DEFS, - entity_descr_output=ENTITY_DESCR, - count_input=ENTITY_COUNTS, - prior_prob_input=PRIOR_PROB, - wikidata_input=WIKIDATA_JSON) + print("STEP 3a: to_create_kb", now()) + kb_1 = kb_creator.create_kb( + nlp=nlp_1, + max_entities_per_alias=MAX_CANDIDATES, + min_entity_freq=MIN_ENTITY_FREQ, + min_occ=MIN_PAIR_OCC, + entity_def_output=ENTITY_DEFS, + entity_descr_output=ENTITY_DESCR, + 
count_input=ENTITY_COUNTS, + prior_prob_input=PRIOR_PROB, + wikidata_input=WIKIDATA_JSON, + ) print("kb entities:", kb_1.get_size_entities()) print("kb aliases:", kb_1.get_size_aliases()) print() - print("STEP 3b: write KB and NLP", datetime.datetime.now()) + print("STEP 3b: write KB and NLP", now()) kb_1.dump(KB_FILE) nlp_1.to_disk(NLP_1_DIR) print() # STEP 4 : read KB back in from file if to_read_kb: - print("STEP 4: to_read_kb", datetime.datetime.now()) + print("STEP 4: to_read_kb", now()) nlp_2 = spacy.load(NLP_1_DIR) kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) kb_2.load_bulk(KB_FILE) @@ -130,20 +139,26 @@ def run_pipeline(): # STEP 5: create a training dataset from WP if create_wp_training: - print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP, - entity_def_input=ENTITY_DEFS, - training_output=TRAINING_DIR) + print("STEP 5: create training dataset", now()) + training_set_creator.create_training( + wikipedia_input=ENWIKI_DUMP, + entity_def_input=ENTITY_DEFS, + training_output=TRAINING_DIR, + ) # STEP 6: create and train the entity linking pipe if train_pipe: - print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + print("STEP 6: training Entity Linking pipe", now()) type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)} print(" -analysing", len(type_to_int), "different entity types") - el_pipe = nlp_2.create_pipe(name='entity_linker', - config={"context_width": CONTEXT_WIDTH, - "pretrained_vectors": nlp_2.vocab.vectors.name, - "type_to_int": type_to_int}) + el_pipe = nlp_2.create_pipe( + name="entity_linker", + config={ + "context_width": CONTEXT_WIDTH, + "pretrained_vectors": nlp_2.vocab.vectors.name, + "type_to_int": type_to_int, + }, + ) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -157,18 +172,22 @@ def run_pipeline(): train_limit = 5000 dev_limit = 5000 - train_data = training_set_creator.read_training(nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=False, - limit=train_limit) + # for training, get pos & neg instances that correspond to entries in the kb + train_data = training_set_creator.read_training( + nlp=nlp_2, + training_dir=TRAINING_DIR, + dev=False, + limit=train_limit, + kb=el_pipe.kb, + ) print("Training on", len(train_data), "articles") print() - dev_data = training_set_creator.read_training(nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit) + # for testing, get all pos instances, whether or not they are in the kb + dev_data = training_set_creator.read_training( + nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, kb=None + ) print("Dev testing on", len(dev_data), "articles") print() @@ -187,8 +206,8 @@ def run_pipeline(): try: docs, golds = zip(*batch) nlp_2.update( - docs, - golds, + docs=docs, + golds=golds, sgd=optimizer, drop=DROPOUT, losses=losses, @@ -200,48 +219,61 @@ def run_pipeline(): if batchnr > 0: el_pipe.cfg["context_weight"] = 1 el_pipe.cfg["prior_weight"] = 1 - dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2), - " / dev acc avg", round(dev_acc_context, 3)) + dev_acc_context, _ = _measure_acc(dev_data, el_pipe) + losses["entity_linker"] = losses["entity_linker"] / batchnr + print( + "Epoch, train loss", + itn, + round(losses["entity_linker"], 2), + " / dev acc avg", + round(dev_acc_context, 3), + ) # STEP 7: 
measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: print() - print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) + print("STEP 7: performance measurement of Entity Linking pipe", now()) print() - counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2) + counts, acc_r, acc_r_d, acc_p, acc_p_d, acc_o, acc_o_d = _measure_baselines( + dev_data, kb_2 + ) print("dev counts:", sorted(counts.items(), key=lambda x: x[0])) - print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()]) - print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) - print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) + + oracle_by_label = [(x, round(y, 3)) for x, y in acc_o_d.items()] + print("dev acc oracle:", round(acc_o, 3), oracle_by_label) + + random_by_label = [(x, round(y, 3)) for x, y in acc_r_d.items()] + print("dev acc random:", round(acc_r, 3), random_by_label) + + prior_by_label = [(x, round(y, 3)) for x, y in acc_p_d.items()] + print("dev acc prior:", round(acc_p, 3), prior_by_label) # using only context el_pipe.cfg["context_weight"] = 1 el_pipe.cfg["prior_weight"] = 0 - dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context avg:", round(dev_acc_context, 3), - [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) + dev_acc_context, dev_acc_cont_d = _measure_acc(dev_data, el_pipe) + context_by_label = [(x, round(y, 3)) for x, y in dev_acc_cont_d.items()] + print("dev acc context avg:", round(dev_acc_context, 3), context_by_label) # measuring combined accuracy (prior + context) el_pipe.cfg["context_weight"] = 1 el_pipe.cfg["prior_weight"] = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) - print("dev acc combo avg:", round(dev_acc_combo, 3), - [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + dev_acc_combo, dev_acc_combo_d = _measure_acc(dev_data, el_pipe) + combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_d.items()] + print("dev acc combo avg:", round(dev_acc_combo, 3), combo_by_label) # STEP 8: apply the EL pipe on a toy example if to_test_pipeline: print() - print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) + print("STEP 8: applying Entity Linking to toy example", now()) print() run_el_toy_example(nlp=nlp_2) # STEP 9: write the NLP pipeline (including entity linker) to file if to_write_nlp: print() - print("STEP 9: testing NLP IO", datetime.datetime.now()) + print("STEP 9: testing NLP IO", now()) print() print("writing to", NLP_2_DIR) nlp_2.to_disk(NLP_2_DIR) @@ -262,23 +294,26 @@ def run_pipeline(): el_pipe = nlp_3.get_pipe("entity_linker") dev_limit = 5000 - dev_data = training_set_creator.read_training(nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit) + dev_data = training_set_creator.read_training( + nlp=nlp_2, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit, + kb=el_pipe.kb, + ) print("Dev testing from file on", len(dev_data), "articles") print() - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False) - print("dev acc combo avg:", round(dev_acc_combo, 3), - [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + dev_acc_combo, dev_acc_combo_dict = _measure_acc(dev_data, el_pipe) + combo_by_label = [(x, 
round(y, 3)) for x, y in dev_acc_combo_dict.items()] + print("dev acc combo avg:", round(dev_acc_combo, 3), combo_by_label) print() - print("STOP", datetime.datetime.now()) + print("STOP", now()) -def _measure_accuracy(data, el_pipe=None, error_analysis=False): +def _measure_acc(data, el_pipe=None, error_analysis=False): # If the docs in the data require further processing with an entity linker, set el_pipe correct_by_label = dict() incorrect_by_label = dict() @@ -291,16 +326,19 @@ def _measure_accuracy(data, el_pipe=None, error_analysis=False): for doc, gold in zip(docs, golds): try: correct_entries_per_article = dict() - for entity in gold.links: - start, end, gold_kb = entity - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + for entity, value in gold.links.items(): + # only evaluating on positive examples + if value: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: ent_label = ent.label_ pred_entity = ent.kb_id_ start = ent.start_char end = ent.end_char - gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + offset = str(start) + "-" + str(end) + gold_entity = correct_entries_per_article.get(offset, None) # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: if gold_entity == pred_entity: @@ -311,28 +349,33 @@ def _measure_accuracy(data, el_pipe=None, error_analysis=False): incorrect_by_label[ent_label] = incorrect + 1 if error_analysis: print(ent.text, "in", doc) - print("Predicted", pred_entity, "should have been", gold_entity) + print( + "Predicted", + pred_entity, + "should have been", + gold_entity, + ) print() except Exception as e: print("Error assessing accuracy", e) - acc, acc_by_label = calculate_acc(correct_by_label, incorrect_by_label) + acc, acc_by_label = calculate_acc(correct_by_label, incorrect_by_label) return acc, acc_by_label def _measure_baselines(data, kb): # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound - counts_by_label = dict() + counts_d = dict() - random_correct_by_label = dict() - random_incorrect_by_label = dict() + random_correct_d = dict() + random_incorrect_d = dict() - oracle_correct_by_label = dict() - oracle_incorrect_by_label = dict() + oracle_correct_d = dict() + oracle_incorrect_d = dict() - prior_correct_by_label = dict() - prior_incorrect_by_label = dict() + prior_correct_d = dict() + prior_incorrect_d = dict() docs = [d for d, g in data if len(d) > 0] golds = [g for d, g in data if len(d) > 0] @@ -345,14 +388,15 @@ def _measure_baselines(data, kb): correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: - ent_label = ent.label_ + label = ent.label_ start = ent.start_char end = ent.end_char - gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + offset = str(start) + "-" + str(end) + gold_entity = correct_entries_per_article.get(offset, None) # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: - counts_by_label[ent_label] = counts_by_label.get(ent_label, 0) + 1 + counts_d[label] = counts_d.get(label, 0) + 1 candidates = kb.get_candidates(ent.text) oracle_candidate = "" best_candidate = "" @@ -370,28 +414,36 @@ def _measure_baselines(data, kb): random_candidate = random.choice(candidates).entity_ if gold_entity == best_candidate: - prior_correct_by_label[ent_label] = 
prior_correct_by_label.get(ent_label, 0) + 1 + prior_correct_d[label] = prior_correct_d.get(label, 0) + 1 else: - prior_incorrect_by_label[ent_label] = prior_incorrect_by_label.get(ent_label, 0) + 1 + prior_incorrect_d[label] = prior_incorrect_d.get(label, 0) + 1 if gold_entity == random_candidate: - random_correct_by_label[ent_label] = random_correct_by_label.get(ent_label, 0) + 1 + random_correct_d[label] = random_correct_d.get(label, 0) + 1 else: - random_incorrect_by_label[ent_label] = random_incorrect_by_label.get(ent_label, 0) + 1 + random_incorrect_d[label] = random_incorrect_d.get(label, 0) + 1 if gold_entity == oracle_candidate: - oracle_correct_by_label[ent_label] = oracle_correct_by_label.get(ent_label, 0) + 1 + oracle_correct_d[label] = oracle_correct_d.get(label, 0) + 1 else: - oracle_incorrect_by_label[ent_label] = oracle_incorrect_by_label.get(ent_label, 0) + 1 + oracle_incorrect_d[label] = oracle_incorrect_d.get(label, 0) + 1 except Exception as e: print("Error assessing accuracy", e) - acc_prior, acc_prior_by_label = calculate_acc(prior_correct_by_label, prior_incorrect_by_label) - acc_rand, acc_rand_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) - acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) + acc_prior, acc_prior_d = calculate_acc(prior_correct_d, prior_incorrect_d) + acc_rand, acc_rand_d = calculate_acc(random_correct_d, random_incorrect_d) + acc_oracle, acc_oracle_d = calculate_acc(oracle_correct_d, oracle_incorrect_d) - return counts_by_label, acc_rand, acc_rand_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + return ( + counts_d, + acc_rand, + acc_rand_d, + acc_prior, + acc_prior_d, + acc_oracle, + acc_oracle_d, + ) def calculate_acc(correct_by_label, incorrect_by_label): @@ -422,15 +474,23 @@ def check_kb(kb): print("generating candidates for " + mention + " :") for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print( + " ", + c.prior_prob, + c.alias_, + "-->", + c.entity_ + " (freq=" + str(c.entity_freq) + ")", + ) print() def run_el_toy_example(nlp): - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel, even in China or Brazil. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." + text = ( + "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " + "Douglas reminds us to always bring our towel, even in China or Brazil. " + "The main character in Doug's novel is the man Arthur Dent, " + "but Douglas doesn't write about George Washington or Homer Simpson." 
+ ) doc = nlp(text) print(text) for ent in doc.ents: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 214648c7f..bdd7da0f9 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -208,7 +208,7 @@ cdef class KnowledgeBase: # Return an empty list if this entity is unknown in this KB if entity_hash not in self._entry_index: - return [] + return [0] * self.entity_vector_length entry_index = self._entry_index[entity_hash] return self._vectors_table[self._entries[entry_index].vector_index] diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 6b948e585..b3f384437 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1151,9 +1151,11 @@ class EntityLinker(Pipe): ents_by_offset = dict() for ent in doc.ents: ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent - for entity in gold.links: - start, end, gold_kb = entity + for entity, value in gold.links.items(): + start, end, kb_id = entity mention = doc.text[start:end] + entity_encoding = self.kb.get_vector(kb_id) + prior_prob = self.kb.get_prior_prob(kb_id, mention) gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] assert gold_ent is not None @@ -1161,24 +1163,17 @@ class EntityLinker(Pipe): if len(type_to_int) > 0: type_vector[type_to_int[gold_ent.label_]] = 1 - candidates = self.kb.get_candidates(mention) - random.shuffle(candidates) - for c in candidates: - kb_id = c.entity_ - entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - context_docs.append(doc) - type_vectors.append(type_vector) + # store data + entity_encodings.append(entity_encoding) + context_docs.append(doc) + type_vectors.append(type_vector) - if self.cfg.get("prior_weight", 1) > 0: - priors.append([c.prior_prob]) - else: - priors.append([0]) + if self.cfg.get("prior_weight", 1) > 0: + priors.append([prior_prob]) + else: + priors.append([0]) - if kb_id == gold_kb: - cats.append([1]) - else: - cats.append([0]) + cats.append([value]) if len(entity_encodings) > 0: assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) From e1213eaf6af1a19b00e9140105982f1a587ae4a6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Jul 2019 13:35:10 +0200 Subject: [PATCH 09/28] use original gold object in get_loss function --- examples/pipeline/wikidata_entity_linking.py | 12 +++++------ spacy/pipeline/pipes.pyx | 21 ++++++++++++-------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 341dc94ed..ab9aa51fd 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -295,11 +295,7 @@ def run_pipeline(): dev_limit = 5000 dev_data = training_set_creator.read_training( - nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit, - kb=el_pipe.kb, + nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, kb=None ) print("Dev testing from file on", len(dev_data), "articles") @@ -383,9 +379,11 @@ def _measure_baselines(data, kb): for doc, gold in zip(docs, golds): try: correct_entries_per_article = dict() - for entity in gold.links: + for entity, value in gold.links.items(): start, end, gold_kb = entity - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + # only evaluating on positive examples + if value: + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: label = ent.label_ diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx 
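Note: the evaluation code above keys the gold links by character offsets and tallies correct/incorrect predictions per entity label before reducing them with calculate_acc, whose body is not shown in full here. A minimal, self-contained sketch of what such a helper plausibly computes (the counts in the final call are made up for illustration, and this is not the script's actual implementation):

# Illustrative sketch only: per-label correct/incorrect counts reduced to
# an overall micro-accuracy plus a per-label breakdown.
def micro_acc(correct_by_label, incorrect_by_label):
    acc_by_label = dict()
    total_correct = 0
    total = 0
    for label in set(correct_by_label) | set(incorrect_by_label):
        correct = correct_by_label.get(label, 0)
        incorrect = incorrect_by_label.get(label, 0)
        total_correct += correct
        total += correct + incorrect
        acc_by_label[label] = correct / float(correct + incorrect)
    overall = total_correct / float(total) if total else 0.0
    return overall, acc_by_label

print(micro_acc({"PERSON": 8, "GPE": 3}, {"PERSON": 2}))  # (0.846..., {"PERSON": 0.8, "GPE": 1.0})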
index b3f384437..7b6bd0ea0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1141,7 +1141,7 @@ class EntityLinker(Pipe): context_docs = [] entity_encodings = [] - cats = [] + priors = [] type_vectors = [] @@ -1173,12 +1173,9 @@ class EntityLinker(Pipe): else: priors.append([0]) - cats.append([value]) - if len(entity_encodings) > 0: - assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) + assert len(priors) == len(entity_encodings) == len(context_docs) == len(type_vectors) - cats = self.model.ops.asarray(cats, dtype="float32") entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) @@ -1186,7 +1183,7 @@ class EntityLinker(Pipe): for i in range(len(entity_encodings))] pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) - loss, d_scores = self.get_loss(scores=pred, golds=cats, docs=docs) + loss, d_scores = self.get_loss(scores=pred, golds=golds, docs=docs) mention_gradient = bp_mention(d_scores, sgd=sgd) context_gradients = [list(x[0:self.cfg.get("context_width")]) for x in mention_gradient] @@ -1198,9 +1195,17 @@ class EntityLinker(Pipe): return 0 def get_loss(self, docs, golds, scores): - d_scores = (scores - golds) + cats = [] + for gold in golds: + for entity, value in gold.links.items(): + cats.append([value]) + + cats = self.model.ops.asarray(cats, dtype="float32") + assert len(scores) == len(cats) + + d_scores = (scores - cats) loss = (d_scores ** 2).sum() - loss = loss / len(golds) + loss = loss / len(cats) return loss, d_scores def __call__(self, doc): From 21176517a7db99939348edb89c635530a8cf6d03 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jul 2019 12:36:15 +0200 Subject: [PATCH 10/28] have gold.links correspond exactly to doc.ents --- .../training_set_creator.py | 56 ++++++++------ examples/pipeline/wikidata_entity_linking.py | 22 +++--- spacy/gold.pyx | 9 ++- spacy/pipeline/pipes.pyx | 76 +++++++++++-------- 4 files changed, 94 insertions(+), 69 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index eb961b9da..e5530ecc7 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -397,33 +397,43 @@ def read_training(nlp, training_dir, dev, limit, kb=None): current_doc = None else: sent = found_ent.sent.as_doc() - # currently feeding the gold data one entity per sentence at a time + gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - # add both pos and neg examples (in random order) - # this will exclude examples not in the KB - if kb: - gold_entities = {} - candidates = kb.get_candidates(alias) - candidate_ids = [c.entity_ for c in candidates] - random.shuffle(candidate_ids) - for kb_id in candidate_ids: - entry = (gold_start, gold_end, kb_id) - if kb_id != wd_id: - gold_entities[entry] = 0.0 + gold_entities = {} + found_useful = False + for ent in sent.ents: + if ent.start_char == gold_start and ent.end_char == gold_end: + # add both pos and neg examples (in random order) + # this will exclude examples not in the KB + if kb: + value_by_id = {} + candidates = kb.get_candidates(alias) + candidate_ids = [c.entity_ for c in candidates] + random.shuffle(candidate_ids) + for kb_id in candidate_ids: + found_useful = True + if kb_id != wd_id: + value_by_id[kb_id] 
= 0.0 + else: + value_by_id[kb_id] = 1.0 + gold_entities[(ent.start_char, ent.end_char)] = value_by_id + # if no KB, keep all positive examples else: - gold_entities[entry] = 1.0 - # keep all positive examples - else: - entry = (gold_start, gold_end, wd_id) - gold_entities = {entry: 1.0} - - gold = GoldParse(doc=sent, links=gold_entities) - data.append((sent, gold)) - total_entities += 1 - if len(data) % 2500 == 0: - print(" -read", total_entities, "entities") + found_useful = True + value_by_id = {wd_id: 1.0} + gold_entities[(ent.start_char, ent.end_char)] = value_by_id + # currently feeding the gold data one entity per sentence at a time + # setting all other entities to empty gold dictionary + else: + gold_entities[(ent.start_char, ent.end_char)] = {} + if found_useful: + gold = GoldParse(doc=sent, links=gold_entities) + data.append((sent, gold)) + total_entities += 1 + if len(data) % 2500 == 0: + print(" -read", total_entities, "entities") print(" -read", total_entities, "entities") return data diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index ab9aa51fd..478d35111 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -322,11 +322,12 @@ def _measure_acc(data, el_pipe=None, error_analysis=False): for doc, gold in zip(docs, golds): try: correct_entries_per_article = dict() - for entity, value in gold.links.items(): + for entity, kb_dict in gold.links.items(): + start, end = entity # only evaluating on positive examples - if value: - start, end, gold_kb = entity - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + for gold_kb, value in kb_dict.items(): + if value: + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: ent_label = ent.label_ @@ -379,11 +380,12 @@ def _measure_baselines(data, kb): for doc, gold in zip(docs, golds): try: correct_entries_per_article = dict() - for entity, value in gold.links.items(): - start, end, gold_kb = entity - # only evaluating on positive examples - if value: - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + for entity, kb_dict in gold.links.items(): + start, end = entity + for gold_kb, value in kb_dict.items(): + # only evaluating on positive examples + if value: + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: label = ent.label_ @@ -487,7 +489,7 @@ def run_el_toy_example(nlp): "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " "Douglas reminds us to always bring our towel, even in China or Brazil. " "The main character in Doug's novel is the man Arthur Dent, " - "but Douglas doesn't write about George Washington or Homer Simpson." + "but Dougledydoug doesn't write about George Washington or Homer Simpson." ) doc = nlp(text) print(text) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 81feb55a4..5459d5424 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -450,10 +450,11 @@ cdef class GoldParse: examples of a label to have the value 0.0. Labels not in the dictionary are treated as missing - the gradient for those labels will be zero. - links (dict): A dict with `(start_char, end_char, kb_id)` keys, - representing the external ID of an entity in a knowledge base, - and the values being either 1.0 or 0.0, indicating positive and - negative examples, respectively. 
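Note: the links annotation is being reworked here from flat (start_char, end_char, kb_id) keys to (start_char, end_char) keys that map to a kb_id -> value dict. As a quick reference, a hand-built GoldParse in the new nested format might look as follows; the offsets and Wikidata IDs are illustrative, and spacy.blank("en") stands in for whichever pipeline is actually used:

# Hypothetical example of the nested links format: one positive and two
# negative KB candidates for the entity spanning characters 0-13.
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp("Douglas Adams wrote The Hitchhiker's Guide to the Galaxy.")
links = {(0, 13): {"Q42": 1.0, "Q1004791": 0.0, "Q5301561": 0.0}}
gold = GoldParse(doc=doc, links=links)
print(gold.links)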
+ links (dict): A dict with `(start_char, end_char)` keys, + and the values being dicts with kb_id:value entries, + representing the external IDs in a knowledge base (KB) + mapped to either 1.0 or 0.0, indicating positive and + negative examples respectively. RETURNS (GoldParse): The newly constructed object. """ if words is None: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7b6bd0ea0..a8746c73d 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1076,6 +1076,7 @@ class EntityLinker(Pipe): DOCS: TODO """ name = 'entity_linker' + NIL = "NIL" # string used to refer to a non-existing link @classmethod def Model(cls, **cfg): @@ -1151,27 +1152,28 @@ class EntityLinker(Pipe): ents_by_offset = dict() for ent in doc.ents: ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent - for entity, value in gold.links.items(): - start, end, kb_id = entity + for entity, kb_dict in gold.links.items(): + start, end = entity mention = doc.text[start:end] - entity_encoding = self.kb.get_vector(kb_id) - prior_prob = self.kb.get_prior_prob(kb_id, mention) + for kb_id, value in kb_dict.items(): + entity_encoding = self.kb.get_vector(kb_id) + prior_prob = self.kb.get_prior_prob(kb_id, mention) - gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] - assert gold_ent is not None - type_vector = [0 for i in range(len(type_to_int))] - if len(type_to_int) > 0: - type_vector[type_to_int[gold_ent.label_]] = 1 + gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] + assert gold_ent is not None + type_vector = [0 for i in range(len(type_to_int))] + if len(type_to_int) > 0: + type_vector[type_to_int[gold_ent.label_]] = 1 - # store data - entity_encodings.append(entity_encoding) - context_docs.append(doc) - type_vectors.append(type_vector) + # store data + entity_encodings.append(entity_encoding) + context_docs.append(doc) + type_vectors.append(type_vector) - if self.cfg.get("prior_weight", 1) > 0: - priors.append([prior_prob]) - else: - priors.append([0]) + if self.cfg.get("prior_weight", 1) > 0: + priors.append([prior_prob]) + else: + priors.append([0]) if len(entity_encodings) > 0: assert len(priors) == len(entity_encodings) == len(context_docs) == len(type_vectors) @@ -1197,8 +1199,9 @@ class EntityLinker(Pipe): def get_loss(self, docs, golds, scores): cats = [] for gold in golds: - for entity, value in gold.links.items(): - cats.append([value]) + for entity, kb_dict in gold.links.items(): + for kb_id, value in kb_dict.items(): + cats.append([value]) cats = self.model.ops.asarray(cats, dtype="float32") assert len(scores) == len(cats) @@ -1209,26 +1212,27 @@ class EntityLinker(Pipe): return loss, d_scores def __call__(self, doc): - entities, kb_ids = self.predict([doc]) - self.set_annotations([doc], entities, kb_ids) + kb_ids = self.predict([doc]) + self.set_annotations([doc], kb_ids) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): docs = list(docs) - entities, kb_ids = self.predict(docs) - self.set_annotations(docs, entities, kb_ids) + kb_ids = self.predict(docs) + self.set_annotations(docs, kb_ids) yield from docs def predict(self, docs): + """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ self.require_model() self.require_kb() - final_entities = [] + entity_count = 0 final_kb_ids = [] if not docs: - return final_entities, final_kb_ids + return final_kb_ids if isinstance(docs, Doc): docs = [docs] @@ -1242,12 
+1246,15 @@ class EntityLinker(Pipe): if len(doc) > 0: context_encoding = context_encodings[i] for ent in doc.ents: + entity_count += 1 type_vector = [0 for i in range(len(type_to_int))] if len(type_to_int) > 0: type_vector[type_to_int[ent.label_]] = 1 candidates = self.kb.get_candidates(ent.text) - if candidates: + if not candidates: + final_kb_ids.append(self.NIL) # no prediction possible for this entity + else: random.shuffle(candidates) # this will set the prior probabilities to 0 (just like in training) if their weight is 0 @@ -1266,15 +1273,20 @@ class EntityLinker(Pipe): # TODO: thresholding best_index = scores.argmax() best_candidate = candidates[best_index] - final_entities.append(ent) final_kb_ids.append(best_candidate.entity_) - return final_entities, final_kb_ids + assert len(final_kb_ids) == entity_count - def set_annotations(self, docs, entities, kb_ids=None): - for entity, kb_id in zip(entities, kb_ids): - for token in entity: - token.ent_kb_id_ = kb_id + return final_kb_ids + + def set_annotations(self, docs, kb_ids, tensors=None): + i=0 + for doc in docs: + for ent in doc.ents: + kb_id = kb_ids[i] + i += 1 + for token in ent: + token.ent_kb_id_ = kb_id def to_disk(self, path, exclude=tuple(), **kwargs): serialize = OrderedDict() From 41fb5204ba47b71b37d012d06e8b039983fa0ef9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jul 2019 14:47:36 +0200 Subject: [PATCH 11/28] output tensors as part of predict --- spacy/pipeline/pipes.pyx | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a8746c73d..5704878b8 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1212,15 +1212,15 @@ class EntityLinker(Pipe): return loss, d_scores def __call__(self, doc): - kb_ids = self.predict([doc]) - self.set_annotations([doc], kb_ids) + kb_ids, tensors = self.predict([doc]) + self.set_annotations([doc], kb_ids, tensors=tensors) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in util.minibatch(stream, size=batch_size): docs = list(docs) - kb_ids = self.predict(docs) - self.set_annotations(docs, kb_ids) + kb_ids, tensors = self.predict(docs) + self.set_annotations(docs, kb_ids, tensors=tensors) yield from docs def predict(self, docs): @@ -1230,6 +1230,7 @@ class EntityLinker(Pipe): entity_count = 0 final_kb_ids = [] + final_tensors = [] if not docs: return final_kb_ids @@ -1244,6 +1245,7 @@ class EntityLinker(Pipe): for i, doc in enumerate(docs): if len(doc) > 0: + # currently, the context is the same for each entity in a sentence (should be refined) context_encoding = context_encodings[i] for ent in doc.ents: entity_count += 1 @@ -1254,6 +1256,7 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if not candidates: final_kb_ids.append(self.NIL) # no prediction possible for this entity + final_tensors.append(context_encoding) else: random.shuffle(candidates) @@ -1274,12 +1277,16 @@ class EntityLinker(Pipe): best_index = scores.argmax() best_candidate = candidates[best_index] final_kb_ids.append(best_candidate.entity_) + final_tensors.append(context_encoding) - assert len(final_kb_ids) == entity_count + assert len(final_tensors) == len(final_kb_ids) == entity_count - return final_kb_ids + return final_kb_ids, final_tensors def set_annotations(self, docs, kb_ids, tensors=None): + count_ents = len([ent for doc in docs for ent in doc.ents]) + assert count_ents == len(kb_ids) + i=0 for doc in docs: for ent in doc.ents: From 
f75d1299a771b1d9870d5de05822df471b321f61 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jul 2019 14:52:45 +0200 Subject: [PATCH 12/28] formatting --- .../training_set_creator.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index e5530ecc7..cc7997a1e 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -365,9 +365,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None): encoding="utf8", ) as f: text = f.read() - if ( - len(text) < 30000 - ): # threshold for convenience / speed of processing + # threshold for convenience / speed of processing + if len(text) < 30000: current_doc = nlp(text) current_article_id = article_id ents_by_offset = dict() @@ -386,7 +385,6 @@ def read_training(nlp, training_dir, dev, limit, kb=None): except Exception as e: print("Problem parsing article", article_id, e) skip_articles.add(article_id) - raise e # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): @@ -404,13 +402,17 @@ def read_training(nlp, training_dir, dev, limit, kb=None): gold_entities = {} found_useful = False for ent in sent.ents: - if ent.start_char == gold_start and ent.end_char == gold_end: + entry = (ent.start_char, ent.end_char) + gold_entry = (gold_start, gold_end) + if entry == gold_entry: # add both pos and neg examples (in random order) # this will exclude examples not in the KB if kb: value_by_id = {} candidates = kb.get_candidates(alias) - candidate_ids = [c.entity_ for c in candidates] + candidate_ids = [ + c.entity_ for c in candidates + ] random.shuffle(candidate_ids) for kb_id in candidate_ids: found_useful = True @@ -418,16 +420,17 @@ def read_training(nlp, training_dir, dev, limit, kb=None): value_by_id[kb_id] = 0.0 else: value_by_id[kb_id] = 1.0 - gold_entities[(ent.start_char, ent.end_char)] = value_by_id + gold_entities[entry] = value_by_id # if no KB, keep all positive examples else: found_useful = True value_by_id = {wd_id: 1.0} - gold_entities[(ent.start_char, ent.end_char)] = value_by_id + + gold_entities[entry] = value_by_id # currently feeding the gold data one entity per sentence at a time # setting all other entities to empty gold dictionary else: - gold_entities[(ent.start_char, ent.end_char)] = {} + gold_entities[entry] = {} if found_useful: gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) From dae8a21282ab1260d141eccf79a9224cc5b0df48 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jul 2019 17:40:28 +0200 Subject: [PATCH 13/28] rename entity frequency --- bin/wiki_entity_linking/kb_creator.py | 2 +- examples/pipeline/dummy_entity_linking.py | 6 ++-- spacy/kb.pxd | 12 +++---- spacy/kb.pyx | 30 ++++++++--------- spacy/structs.pxd | 2 +- spacy/tests/pipeline/test_entity_linker.py | 38 +++++++++++----------- spacy/tests/serialize/test_serialize_kb.py | 8 ++--- 7 files changed, 49 insertions(+), 49 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index e8e081cef..d8cdf6dd7 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -70,7 +70,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() print(" * adding", len(entity_list), "entities", datetime.datetime.now()) - kb.set_entities(entity_list=entity_list, prob_list=frequency_list, 
vector_list=embeddings) + kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings) print() print(" * adding aliases", datetime.datetime.now()) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 0e59db304..6dde616b8 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -14,15 +14,15 @@ def create_kb(vocab): # adding entities entity_0 = "Q1004791_Douglas" print("adding entity", entity_0) - kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0]) + kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0]) entity_1 = "Q42_Douglas_Adams" print("adding entity", entity_1) - kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1]) + kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1]) entity_2 = "Q5301561_Douglas_Haig" print("adding entity", entity_2) - kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2]) + kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2]) # adding aliases print() diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 40b22b275..d5aa382b1 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -79,7 +79,7 @@ cdef class KnowledgeBase: return new_index - cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq, int32_t vector_index, int feats_row) nogil: """Add an entry to the vector of entries. After calling this method, make sure to update also the _entry_index using the return value""" @@ -92,7 +92,7 @@ cdef class KnowledgeBase: entry.entity_hash = entity_hash entry.vector_index = vector_index entry.feats_row = feats_row - entry.prob = prob + entry.freq = freq self._entries.push_back(entry) return new_index @@ -125,7 +125,7 @@ cdef class KnowledgeBase: entry.entity_hash = dummy_hash entry.vector_index = dummy_value entry.feats_row = dummy_value - entry.prob = dummy_value + entry.freq = dummy_value # Avoid struct initializer to enable nogil cdef vector[int64_t] dummy_entry_indices @@ -141,7 +141,7 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) cpdef load_bulk(self, loc) - cpdef set_entities(self, entity_list, prob_list, vector_list) + cpdef set_entities(self, entity_list, freq_list, vector_list) cdef class Writer: @@ -149,7 +149,7 @@ cdef class Writer: cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 cdef int write_vector_element(self, float element) except -1 - cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1 + cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1 cdef int write_alias_length(self, int64_t alias_length) except -1 cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 @@ -162,7 +162,7 @@ cdef class Reader: cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 cdef int read_vector_element(self, float* element) except -1 - cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1 + cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1 cdef int read_alias_length(self, int64_t* alias_length) except -1 cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index bdd7da0f9..9df0e4fc2 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -94,7 +94,7 @@ cdef class KnowledgeBase: def 
get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_entity(self, unicode entity, float prob, vector[float] entity_vector): + def add_entity(self, unicode entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. @@ -113,15 +113,15 @@ cdef class KnowledgeBase: vector_index = self.c_add_vector(entity_vector=entity_vector) new_index = self.c_add_entity(entity_hash=entity_hash, - prob=prob, + freq=freq, vector_index=vector_index, feats_row=-1) # Features table currently not implemented self._entry_index[entity_hash] = new_index return entity_hash - cpdef set_entities(self, entity_list, prob_list, vector_list): - if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list): + cpdef set_entities(self, entity_list, freq_list, vector_list): + if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list): raise ValueError(Errors.E140) nr_entities = len(entity_list) @@ -137,7 +137,7 @@ cdef class KnowledgeBase: entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash - entry.prob = prob_list[i] + entry.freq = freq_list[i] vector_index = self.c_add_vector(entity_vector=vector_list[i]) entry.vector_index = vector_index @@ -196,7 +196,7 @@ cdef class KnowledgeBase: return [Candidate(kb=self, entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].prob, + entity_freq=self._entries[entry_index].freq, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], alias_hash=alias_hash, prior_prob=prior_prob) @@ -252,7 +252,7 @@ cdef class KnowledgeBase: entry = self._entries[entry_index] assert entry.entity_hash == entry_hash assert entry_index == i - writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index) + writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index) i = i+1 writer.write_alias_length(self.get_size_aliases()) @@ -278,7 +278,7 @@ cdef class KnowledgeBase: cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index - cdef float prob + cdef float freq cdef int32_t vector_index cdef KBEntryC entry cdef AliasC alias @@ -314,10 +314,10 @@ cdef class KnowledgeBase: # index 0 is a dummy object not stored in the _entry_index and can be ignored. 
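Note: this commit renames the stored per-entity value from `prob` to `freq` across the KB API. A toy KB built against the renamed keyword, mirroring the updated tests later in this patch (entities, frequencies and vectors are arbitrary values):

# Toy KB using the renamed `freq` keyword (formerly `prob`); all values
# are arbitrary and only meant to exercise the updated signature.
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

kb = KnowledgeBase(Vocab(), entity_vector_length=3)
kb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
kb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
kb.add_alias(alias="douglas", entities=["Q1", "Q2"], probabilities=[0.8, 0.2])
for c in kb.get_candidates("douglas"):
    print(c.entity_, c.entity_freq, c.prior_prob)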
i = 1 while i <= nr_entities: - reader.read_entry(&entity_hash, &prob, &vector_index) + reader.read_entry(&entity_hash, &freq, &vector_index) entry.entity_hash = entity_hash - entry.prob = prob + entry.freq = freq entry.vector_index = vector_index entry.feats_row = -1 # Features table currently not implemented @@ -387,9 +387,9 @@ cdef class Writer: cdef int write_vector_element(self, float element) except -1: self._write(&element, sizeof(element)) - cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1: + cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1: self._write(&entry_hash, sizeof(entry_hash)) - self._write(&entry_prob, sizeof(entry_prob)) + self._write(&entry_freq, sizeof(entry_freq)) self._write(&vector_index, sizeof(vector_index)) # Features table currently not implemented and not written to file @@ -444,18 +444,18 @@ cdef class Reader: return 0 # end of file raise IOError("error reading entity vector from input file") - cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1: + cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity hash from input file") - status = self._read(prob, sizeof(float)) + status = self._read(freq, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entity prob from input file") + raise IOError("error reading entity freq from input file") status = self._read(vector_index, sizeof(int32_t)) if status < 1: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index e80b1b4d6..6c643b4cd 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -93,7 +93,7 @@ cdef struct KBEntryC: int32_t feats_row # log probability of entity, based on corpus frequency - float prob + float freq # Each alias struct stores a list of Entry pointers with their prior probabilities diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c3163200a..ab4055bba 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3]) - mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3]) + mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5]) # adding aliases mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) @@ -50,9 +50,9 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): @@ -66,9 +66,9 @@ def 
test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): @@ -80,9 +80,9 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): @@ -96,11 +96,11 @@ def test_kb_invalid_entity_vector(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3]) # this should fail because the kb's expected entity vector length is 3 with pytest.raises(ValueError): - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) def test_candidate_generation(nlp): @@ -108,9 +108,9 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) @@ -133,8 +133,8 @@ def test_preserving_links_asdoc(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1]) # adding aliases mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index fa7253fa1..1752abda2 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0]) + 
kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4]) kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) From 9f8c1e71a21457aa4110d9cced223665d70017d5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jul 2019 13:34:12 +0200 Subject: [PATCH 14/28] fix for Issue #4000 --- examples/pipeline/wikidata_entity_linking.py | 18 +++++++++++++----- spacy/kb.pyx | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 478d35111..32f751cd7 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import os +from os import path import random import datetime from pathlib import Path @@ -26,7 +28,8 @@ ENTITY_COUNTS = OUTPUT_DIR / "entity_freq.csv" ENTITY_DEFS = OUTPUT_DIR / "entity_defs.csv" ENTITY_DESCR = OUTPUT_DIR / "entity_descriptions.csv" -KB_FILE = OUTPUT_DIR / "kb_1" / "kb" +KB_DIR = OUTPUT_DIR / "kb_1" +KB_FILE = "kb" NLP_1_DIR = OUTPUT_DIR / "nlp_1" NLP_2_DIR = OUTPUT_DIR / "nlp_2" @@ -118,7 +121,10 @@ def run_pipeline(): print() print("STEP 3b: write KB and NLP", now()) - kb_1.dump(KB_FILE) + + if not path.exists(KB_DIR): + os.makedirs(KB_DIR) + kb_1.dump(KB_DIR / KB_FILE) nlp_1.to_disk(NLP_1_DIR) print() @@ -127,7 +133,7 @@ def run_pipeline(): print("STEP 4: to_read_kb", now()) nlp_2 = spacy.load(NLP_1_DIR) kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) - kb_2.load_bulk(KB_FILE) + kb_2.load_bulk(KB_DIR / KB_FILE) print("kb entities:", kb_2.get_size_entities()) print("kb aliases:", kb_2.get_size_aliases()) print() @@ -327,7 +333,8 @@ def _measure_acc(data, el_pipe=None, error_analysis=False): # only evaluating on positive examples for gold_kb, value in kb_dict.items(): if value: - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + offset = str(start) + "-" + str(end) + correct_entries_per_article[offset] = gold_kb for ent in doc.ents: ent_label = ent.label_ @@ -385,7 +392,8 @@ def _measure_baselines(data, kb): for gold_kb, value in kb_dict.items(): # only evaluating on positive examples if value: - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + offset = str(start) + "-" + str(end) + correct_entries_per_article[offset] = gold_kb for ent in doc.ents: label = ent.label_ diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 9df0e4fc2..0f1c87de8 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -278,7 +278,7 @@ cdef class KnowledgeBase: cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index - cdef float freq + cdef float freq, prob cdef int32_t vector_index cdef KBEntryC entry cdef AliasC alias @@ -373,7 +373,7 @@ cdef class Writer: loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') - assert self._fp != NULL + assert self._fp != NULL, "Could not access %s" % loc fseek(self._fp, 0, 0) def close(self): From 76184374e2d438928417fbc2dc4a31380889ebab Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jul 2019 13:39:32 +0200 Subject: [PATCH 15/28] test corner cases --- spacy/kb.pyx | 3 +-- spacy/tests/pipeline/test_entity_linker.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/kb.pyx 
b/spacy/kb.pyx index 0f1c87de8..11d895eee 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -191,7 +191,7 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): cdef hash_t alias_hash = self.vocab.strings[alias] - alias_index = self._alias_index.get(alias_hash) # TODO: check for error? unit test ! + alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] return [Candidate(kb=self, @@ -219,7 +219,6 @@ cdef class KnowledgeBase: cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t entity_hash = self.vocab.strings[entity] - # TODO: error ? if entity_hash not in self._entry_index or alias_hash not in self._alias_index: return 0.0 diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index ab4055bba..ca6bf2b6c 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -43,6 +43,8 @@ def test_kb_valid_entities(nlp): # test retrieval of prior probabilities assert_almost_equal(mykb.get_prior_prob(entity="Q2", alias="douglas"), 0.8) assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglas"), 0.2) + assert_almost_equal(mykb.get_prior_prob(entity="Q342", alias="douglas"), 0.0) + assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglassssss"), 0.0) def test_kb_invalid_entities(nlp): From a32b033b8c30fa038ce8845333c1560059475f39 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 22 Jul 2019 14:18:24 +0200 Subject: [PATCH 16/28] Add regression test for #4002 Test that the PhraseMatcher can match on overwritten NORM attributes. --- spacy/tests/regression/test_issue4002.py | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 spacy/tests/regression/test_issue4002.py diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py new file mode 100644 index 000000000..d9b509a30 --- /dev/null +++ b/spacy/tests/regression/test_issue4002.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc + + +@pytest.mark.xfail +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes. + """ + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", None, pattern1) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", None, pattern2) + matches = matcher(doc) + assert len(matches) == 1 From 5d544f89baf0f8ab0598c3df5a64afe059623487 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jul 2019 14:36:07 +0200 Subject: [PATCH 17/28] Errors.E145 for IO errors when reading KB --- spacy/errors.py | 1 + spacy/kb.pyx | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index cb8bb44b4..937596d5d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -407,6 +407,7 @@ class Errors(object): E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'") E143 = ("Labels for component '{name}' not initialized. 
Did you forget to call add_label()?") E144 = ("Could not find parameter `{param}` when building the entity linker model.") + E145 = ("Error reading `{param}` from input file.") @add_codes diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 11d895eee..a300ed34a 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -428,39 +428,39 @@ cdef class Reader: if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading header from input file") + raise IOError(Errors.E145.format(param="header")) status = self._read(entity_vector_length, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading header from input file") + raise IOError(Errors.E145.format(param="vector length")) cdef int read_vector_element(self, float* element) except -1: status = self._read(element, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entity vector from input file") + raise IOError(Errors.E145.format(param="vector element")) cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entity hash from input file") + raise IOError(Errors.E145.format(param="entity hash")) status = self._read(freq, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entity freq from input file") + raise IOError(Errors.E145.format(param="entity freq")) status = self._read(vector_index, sizeof(int32_t)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entity vector from input file") + raise IOError(Errors.E145.format(param="vector index")) if feof(self._fp): return 0 @@ -472,33 +472,33 @@ cdef class Reader: if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading alias length from input file") + raise IOError(Errors.E145.format(param="alias length")) cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1: status = self._read(alias_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading alias hash from input file") + raise IOError(Errors.E145.format(param="alias hash")) status = self._read(candidate_length, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading candidate length from input file") + raise IOError(Errors.E145.format(param="candidate length")) cdef int read_alias(self, int64_t* entry_index, float* prob) except -1: status = self._read(entry_index, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entry index for alias from input file") + raise IOError(Errors.E145.format(param="entry index")) status = self._read(prob, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading prob for entity/alias from input file") + raise IOError(Errors.E145.format(param="prior probability")) cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) From b1911f7105758730695ff71d07638da3e8253669 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jul 2019 14:56:13 +0200 Subject: [PATCH 18/28] Errors.E146 for IO error when FP is null --- spacy/errors.py | 1 + spacy/kb.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py 
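Note: patches 17 and 18 replace the ad-hoc IOError strings in the KB reader/writer with numbered, templated constants (E145/E146). A self-contained analogue of that pattern is sketched below; the class and function names are stand-ins, not spaCy's actual Errors object or reader code:

# Analogue of the numbered, templated error constants used above; this is
# a demo class, not spaCy's Errors object.
class DemoErrors(object):
    E145 = "Error reading `{param}` from input file."
    E146 = "Could not access `{path}`."


def read_header(fp, path):
    if fp is None:
        raise IOError(DemoErrors.E146.format(path=path))
    header = fp.read(8)
    if len(header) < 8:
        raise IOError(DemoErrors.E145.format(param="header"))
    return header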
index 937596d5d..bfdf71652 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -408,6 +408,7 @@ class Errors(object): E143 = ("Labels for component '{name}' not initialized. Did you forget to call add_label()?") E144 = ("Could not find parameter `{param}` when building the entity linker model.") E145 = ("Error reading `{param}` from input file.") + E146 = ("Could not access `{path}`.") @add_codes diff --git a/spacy/kb.pyx b/spacy/kb.pyx index a300ed34a..28e762653 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -372,7 +372,8 @@ cdef class Writer: loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') - assert self._fp != NULL, "Could not access %s" % loc + if not self._fp: + raise IOError(Errors.E146.format(path=loc)) fseek(self._fp, 0, 0) def close(self): From 3e140534d96cecae065a762f0530a3c67a440579 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jul 2019 15:04:57 +0200 Subject: [PATCH 19/28] format --- bin/wiki_entity_linking/training_set_creator.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index cc7997a1e..a0ca4444c 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -307,18 +307,8 @@ def _write_training_article(article_id, clean_text, training_output): def _write_training_entity(outputfile, article_id, alias, entity, start, end): - outputfile.write( - article_id - + "|" - + alias - + "|" - + entity - + "|" - + str(start) - + "|" - + str(end) - + "\n" - ) + line = "{}|{}|{}|{}|{}\n".format(article_id, alias, entity, start, end) + outputfile.write(line) def is_dev(article_id): From 20389e4553becc0f9f86c8ff7082e8e33e9e4afa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jul 2019 15:08:17 +0200 Subject: [PATCH 20/28] format and bugfix --- spacy/pipeline/pipes.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 5704878b8..f5cc58411 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1151,7 +1151,7 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() for ent in doc.ents: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + ents_by_offset["{}_{}".format(ent.start_char, ent.end_char)] = ent for entity, kb_dict in gold.links.items(): start, end = entity mention = doc.text[start:end] @@ -1159,7 +1159,7 @@ class EntityLinker(Pipe): entity_encoding = self.kb.get_vector(kb_id) prior_prob = self.kb.get_prior_prob(kb_id, mention) - gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] + gold_ent = ents_by_offset["{}_{}".format(start, end)] assert gold_ent is not None type_vector = [0 for i in range(len(type_to_int))] if len(type_to_int) > 0: From cd6c263fe4119b74e88d0c072f201d740717d83a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Jul 2019 11:31:29 +0200 Subject: [PATCH 21/28] format offsets --- bin/wiki_entity_linking/training_set_creator.py | 10 ++++------ examples/pipeline/wikidata_entity_linking.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index a0ca4444c..74bdbe9fb 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -364,11 +364,8 
@@ def read_training(nlp, training_dir, dev, limit, kb=None): sent_length = len(ent.sent) # custom filtering to avoid too long or too short sentences if 5 < sent_length < 100: - ents_by_offset[ - str(ent.start_char) - + "_" - + str(ent.end_char) - ] = ent + offset = "{}_{}".format(ent.start_char, ent.end_char) + ents_by_offset[offset] = ent else: skip_articles.add(article_id) current_doc = None @@ -378,7 +375,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None): # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): - found_ent = ents_by_offset.get(start + "_" + end, None) + offset = "{}_{}".format(start, end) + found_ent = ents_by_offset.get(offset, None) if found_ent: if found_ent.text != alias: skip_articles.add(article_id) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 32f751cd7..04e5bce6d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -333,7 +333,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False): # only evaluating on positive examples for gold_kb, value in kb_dict.items(): if value: - offset = str(start) + "-" + str(end) + offset = _offset(start, end) correct_entries_per_article[offset] = gold_kb for ent in doc.ents: @@ -341,7 +341,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False): pred_entity = ent.kb_id_ start = ent.start_char end = ent.end_char - offset = str(start) + "-" + str(end) + offset = _offset(start, end) gold_entity = correct_entries_per_article.get(offset, None) # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: @@ -392,14 +392,14 @@ def _measure_baselines(data, kb): for gold_kb, value in kb_dict.items(): # only evaluating on positive examples if value: - offset = str(start) + "-" + str(end) + offset = _offset(start, end) correct_entries_per_article[offset] = gold_kb for ent in doc.ents: label = ent.label_ start = ent.start_char end = ent.end_char - offset = str(start) + "-" + str(end) + offset = _offset(start, end) gold_entity = correct_entries_per_article.get(offset, None) # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' @@ -454,6 +454,10 @@ def _measure_baselines(data, kb): ) +def _offset(start, end): + return "{}_{}".format(start, end) + + def calculate_acc(correct_by_label, incorrect_by_label): acc_by_label = dict() total_correct = 0 From 400ff342cfd3b4abf540ba36bebab4a1ca5fc751 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Jul 2019 11:52:48 +0200 Subject: [PATCH 22/28] replace assert's with custom error messages --- spacy/errors.py | 4 ++++ spacy/pipeline/pipes.pyx | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index bfdf71652..4af8b756c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -409,6 +409,10 @@ class Errors(object): E144 = ("Could not find parameter `{param}` when building the entity linker model.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") + E147 = ("Unexpected error in the {method} functionality of the EntityLinker: {msg}. " + "This is likely a bug in spaCy, so feel free to open an issue.") + E148 = ("Expected {ents} KB identifiers but got {ids}. 
Make sure that each entity in `doc.ents` " + "is assigned to a KB identifier.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f5cc58411..63efc3f49 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1160,7 +1160,9 @@ class EntityLinker(Pipe): prior_prob = self.kb.get_prior_prob(kb_id, mention) gold_ent = ents_by_offset["{}_{}".format(start, end)] - assert gold_ent is not None + if gold_ent is None: + raise RuntimeError(Errors.E147.format(method="update", msg="gold entity not found")) + type_vector = [0 for i in range(len(type_to_int))] if len(type_to_int) > 0: type_vector[type_to_int[gold_ent.label_]] = 1 @@ -1176,7 +1178,8 @@ class EntityLinker(Pipe): priors.append([0]) if len(entity_encodings) > 0: - assert len(priors) == len(entity_encodings) == len(context_docs) == len(type_vectors) + if not (len(priors) == len(entity_encodings) == len(context_docs) == len(type_vectors)): + raise RuntimeError(Errors.E147.format(method="update", msg="vector lengths not equal")) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") @@ -1204,7 +1207,8 @@ class EntityLinker(Pipe): cats.append([value]) cats = self.model.ops.asarray(cats, dtype="float32") - assert len(scores) == len(cats) + if len(scores) != len(cats): + raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up")) d_scores = (scores - cats) loss = (d_scores ** 2).sum() @@ -1267,7 +1271,9 @@ class EntityLinker(Pipe): if self.cfg.get("context_weight", 1) > 0: entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - assert len(entity_encodings) == len(prior_probs) + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + mention_encodings = [list(context_encoding) + list(entity_encodings[i]) + list(prior_probs[i]) + type_vector for i in range(len(entity_encodings))] @@ -1279,13 +1285,15 @@ class EntityLinker(Pipe): final_kb_ids.append(best_candidate.entity_) final_tensors.append(context_encoding) - assert len(final_tensors) == len(final_kb_ids) == entity_count + if not (len(final_tensors) == len(final_kb_ids) == entity_count): + raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) return final_kb_ids, final_tensors def set_annotations(self, docs, kb_ids, tensors=None): count_ents = len([ent for doc in docs for ent in doc.ents]) - assert count_ents == len(kb_ids) + if count_ents != len(kb_ids): + raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i=0 for doc in docs: From a037206f0a5453050f19fc4801d2efa445cbe4d7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Jul 2019 12:17:19 +0200 Subject: [PATCH 23/28] use pathlib instead --- bin/wiki_entity_linking/kb_creator.py | 63 ++++++++++++------- .../training_set_creator.py | 20 +++--- .../wikipedia_processor.py | 8 +-- 3 files changed, 55 insertions(+), 36 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index d8cdf6dd7..5b25475b2 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -13,9 +13,17 @@ INPUT_DIM = 300 # dimension of pre-trained input vectors DESC_WIDTH = 64 # dimension of output entity vectors -def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, - entity_def_output, entity_descr_output, - count_input, prior_prob_input, wikidata_input): +def create_kb( + nlp, + max_entities_per_alias, + 
min_entity_freq, + min_occ, + entity_def_output, + entity_descr_output, + count_input, + prior_prob_input, + wikidata_input, +): # Create the knowledge base from Wikidata entries kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) @@ -28,7 +36,9 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input) # write the title-ID and ID-description mappings to file - _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) + _write_entity_files( + entity_def_output, entity_descr_output, title_to_id, id_to_descr + ) else: # read the mappings from file @@ -54,8 +64,8 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, frequency_list.append(freq) filtered_title_to_id[title] = entity - print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), - "titles with filter frequency", min_entity_freq) + print(len(title_to_id.keys()), "original titles") + print("kept", len(filtered_title_to_id.keys()), " with frequency", min_entity_freq) print() print(" * train entity encoder", datetime.datetime.now()) @@ -70,14 +80,20 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() print(" * adding", len(entity_list), "entities", datetime.datetime.now()) - kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings) + kb.set_entities( + entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings + ) print() print(" * adding aliases", datetime.datetime.now()) print() - _add_aliases(kb, title_to_id=filtered_title_to_id, - max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, - prior_prob_input=prior_prob_input) + _add_aliases( + kb, + title_to_id=filtered_title_to_id, + max_entities_per_alias=max_entities_per_alias, + min_occ=min_occ, + prior_prob_input=prior_prob_input, + ) print() print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) @@ -86,13 +102,15 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, return kb -def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr): - with open(entity_def_output, mode='w', encoding='utf8') as id_file: +def _write_entity_files( + entity_def_output, entity_descr_output, title_to_id, id_to_descr +): + with entity_def_output.open("w", encoding="utf8") as id_file: id_file.write("WP_title" + "|" + "WD_id" + "\n") for title, qid in title_to_id.items(): id_file.write(title + "|" + str(qid) + "\n") - with open(entity_descr_output, mode='w', encoding='utf8') as descr_file: + with entity_descr_output.open("w", encoding="utf8") as descr_file: descr_file.write("WD_id" + "|" + "description" + "\n") for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") @@ -100,8 +118,8 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ def get_entity_to_id(entity_def_output): entity_to_id = dict() - with open(entity_def_output, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') + with entity_def_output.open("r", encoding="utf8") as csvfile: + csvreader = csv.reader(csvfile, delimiter="|") # skip header next(csvreader) for row in csvreader: @@ -111,8 +129,8 @@ def get_entity_to_id(entity_def_output): def get_id_to_description(entity_descr_output): id_to_desc = dict() - with open(entity_descr_output, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') + with 
entity_descr_output.open("r", encoding="utf8") as csvfile: + csvreader = csv.reader(csvfile, delimiter="|") # skip header next(csvreader) for row in csvreader: @@ -125,7 +143,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in # adding aliases with prior probabilities # we can read this file sequentially, it's sorted by alias, and then by count - with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: + with prior_prob_input.open("r", encoding="utf8") as prior_file: # skip header prior_file.readline() line = prior_file.readline() @@ -134,7 +152,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in counts = [] entities = [] while line: - splits = line.replace('\n', "").split(sep='|') + splits = line.replace("\n", "").split(sep="|") new_alias = splits[0] count = int(splits[1]) entity = splits[2] @@ -153,7 +171,11 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in if selected_entities: try: - kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + kb.add_alias( + alias=previous_alias, + entities=selected_entities, + probabilities=prior_probs, + ) except ValueError as e: print(e) total_count = 0 @@ -168,4 +190,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in previous_alias = new_alias line = prior_file.readline() - diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 74bdbe9fb..b090d7659 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import os import random import re import bz2 @@ -37,7 +36,7 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N read_ids = set() entityfile_loc = training_output / ENTITY_FILE - with open(entityfile_loc, mode="w", encoding="utf8") as entityfile: + with entityfile_loc.open("w", encoding="utf8") as entityfile: # write entity training header file _write_training_entity( outputfile=entityfile, @@ -301,8 +300,8 @@ def _get_clean_wp_text(article_text): def _write_training_article(article_id, clean_text, training_output): - file_loc = training_output / str(article_id) + ".txt" - with open(file_loc, mode="w", encoding="utf8") as outputfile: + file_loc = training_output / "{}.txt".format(article_id) + with file_loc.open("w", encoding="utf8") as outputfile: outputfile.write(clean_text) @@ -330,7 +329,7 @@ def read_training(nlp, training_dir, dev, limit, kb=None): skip_articles = set() total_entities = 0 - with open(entityfile_loc, mode="r", encoding="utf8") as file: + with entityfile_loc.open("r", encoding="utf8") as file: for line in file: if not limit or len(data) < limit: fields = line.replace("\n", "").split(sep="|") @@ -349,11 +348,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None): # parse the new article text file_name = article_id + ".txt" try: - with open( - os.path.join(training_dir, file_name), - mode="r", - encoding="utf8", - ) as f: + training_file = training_dir / file_name + with training_file.open("r", encoding="utf8") as f: text = f.read() # threshold for convenience / speed of processing if len(text) < 30000: @@ -364,7 +360,9 @@ def read_training(nlp, training_dir, dev, limit, kb=None): sent_length = len(ent.sent) # custom filtering to avoid too long or too short sentences if 5 < sent_length < 100: - offset = 
"{}_{}".format(ent.start_char, ent.end_char) + offset = "{}_{}".format( + ent.start_char, ent.end_char + ) ents_by_offset[offset] = ent else: skip_articles.add(article_id) diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 4d11aee61..80d75b013 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -143,7 +143,7 @@ def read_prior_probs(wikipedia_input, prior_prob_output): cnt += 1 # write all aliases and their entities and count occurrences to file - with open(prior_prob_output, mode="w", encoding="utf8") as outputfile: + with prior_prob_output.open("w", encoding="utf8") as outputfile: outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True) @@ -220,7 +220,7 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): entity_to_count = dict() total_count = 0 - with open(prior_prob_input, mode="r", encoding="utf8") as prior_file: + with prior_prob_input.open("r", encoding="utf8") as prior_file: # skip header prior_file.readline() line = prior_file.readline() @@ -238,7 +238,7 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): line = prior_file.readline() - with open(count_output, mode="w", encoding="utf8") as entity_file: + with count_output.open("w", encoding="utf8") as entity_file: entity_file.write("entity" + "|" + "count" + "\n") for entity, count in entity_to_count.items(): entity_file.write(entity + "|" + str(count) + "\n") @@ -251,7 +251,7 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): def get_all_frequencies(count_input): entity_to_count = dict() - with open(count_input, "r", encoding="utf8") as csvfile: + with count_input.open("r", encoding="utf8") as csvfile: csvreader = csv.reader(csvfile, delimiter="|") # skip header next(csvreader) From 4e7ec1ed31301ee24fec74633fcc7f8224491317 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Jul 2019 14:23:58 +0200 Subject: [PATCH 24/28] return fix --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 63efc3f49..609c4e852 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1237,7 +1237,7 @@ class EntityLinker(Pipe): final_tensors = [] if not docs: - return final_kb_ids + return final_kb_ids, final_tensors if isinstance(docs, Doc): docs = [docs] From ba02957c80c49658f323c49df8d1f768a11e99b3 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 23 Jul 2019 18:28:55 +0200 Subject: [PATCH 25/28] Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences --- spacy/tests/regression/test_issue3962.py | 112 +++++++++++++++++++++++ spacy/tokens/doc.pyx | 4 +- spacy/tokens/span.pyx | 46 +++++++++- 3 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue3962.py diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py new file mode 100644 
index 000000000..c7979c2f3
--- /dev/null
+++ b/spacy/tests/regression/test_issue3962.py
@@ -0,0 +1,112 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+    text = "He jests at scars, that never felt a wound."
+    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ccomp",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962(doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = doc[1:5]  # "jests at scars ,"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root
+    assert doc2[0].dep_ == "dep"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"  # head set to the new artificial root
+    assert doc2[3].dep_ == "dep"
+
+    # We should still have 1 sentence
+    assert len(list(doc2.sents)) == 1
+
+    span3 = doc[6:9]  # "never felt a"
+    doc3 = span3.as_doc()
+    doc3_json = doc3.to_json()
+    assert doc3_json
+
+    assert doc3[0].head.text == "felt"
+    assert doc3[0].dep_ == "neg"
+    assert doc3[1].head.text == "felt"
+    assert doc3[1].dep_ == "ROOT"
+    assert doc3[2].head.text == "felt"  # head set to ancestor
+    assert doc3[2].dep_ == "dep"
+
+    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
+    assert len(list(doc3.sents)) == 1
+
+
+@pytest.fixture
+def two_sent_doc(en_tokenizer):
+    text = "He jests at scars. They never felt a wound."
+    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
+    deps = [
+        "nsubj",
+        "ROOT",
+        "prep",
+        "pobj",
+        "punct",
+        "nsubj",
+        "neg",
+        "ROOT",
+        "det",
+        "dobj",
+        "punct",
+    ]
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_issue3962_long(two_sent_doc):
+    """ Ensure that as_doc does not result in out-of-bound access of tokens.
+    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
+    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
+    doc2 = span2.as_doc()
+    doc2_json = doc2.to_json()
+    assert doc2_json
+
+    assert doc2[0].head.text == "jests"  # head set to itself, being the new artificial root (in sentence 1)
+    assert doc2[0].dep_ == "ROOT"
+    assert doc2[1].head.text == "jests"
+    assert doc2[1].dep_ == "prep"
+    assert doc2[2].head.text == "at"
+    assert doc2[2].dep_ == "pobj"
+    assert doc2[3].head.text == "jests"
+    assert doc2[3].dep_ == "punct"
+    assert doc2[4].head.text == "They"  # head set to itself, being the new artificial root (in sentence 2)
+    assert doc2[4].dep_ == "dep"
+    assert doc2[5].head.text == "They"  # head set to the new artificial head (in sentence 2)
+    assert doc2[5].dep_ == "dep"
+
+    # We should still have 2 sentences
+    sents = list(doc2.sents)
+    assert len(sents) == 2
+    assert sents[0].text == "jests at scars ."
+ assert sents[1].text == "They never" diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c1883f9c0..7ab1563e9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -794,7 +794,7 @@ cdef class Doc: if array[i, col] != 0: self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) # Now load the data - for i in range(self.length): + for i in range(length): token = &self.c[i] for j in range(n_attrs): if attr_ids[j] != TAG: @@ -804,7 +804,7 @@ cdef class Doc: self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) # If document is parsed, set children if self.is_parsed: - set_children_from_heads(self.c, self.length) + set_children_from_heads(self.c, length) return self def get_lca_matrix(self): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 3f4f4418b..42fb9852d 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t from ..attrs cimport * from ..lexeme cimport Lexeme +from ..symbols cimport dep from ..util import normalize_slice from ..compat import is_config, basestring_ @@ -206,7 +207,6 @@ cdef class Span: DOCS: https://spacy.io/api/span#as_doc """ - # TODO: Fix! words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) @@ -220,7 +220,9 @@ cdef class Span: else: array_head.append(SENT_START) array = self.doc.to_array(array_head) - doc.from_array(array_head, array[self.start : self.end]) + array = array[self.start : self.end] + self._fix_dep_copy(array_head, array) + doc.from_array(array_head, array) doc.noun_chunks_iterator = self.doc.noun_chunks_iterator doc.user_hooks = self.doc.user_hooks doc.user_span_hooks = self.doc.user_span_hooks @@ -235,6 +237,44 @@ cdef class Span: doc.cats[cat_label] = value return doc + def _fix_dep_copy(self, attrs, array): + """ Rewire dependency links to make sure their heads fall into the span + while still keeping the correct number of sentences. """ + cdef int length = len(array) + cdef attr_t value + cdef int i, head_col, ancestor_i + old_to_new_root = dict() + if HEAD in attrs: + head_col = attrs.index(HEAD) + for i in range(length): + # if the HEAD refers to a token outside this span, find a more appropriate ancestor + token = self[i] + ancestor_i = token.head.i - self.start # span offset + if ancestor_i not in range(length): + if DEP in attrs: + array[i, attrs.index(DEP)] = dep + + # try finding an ancestor within this span + ancestors = token.ancestors + for ancestor in ancestors: + ancestor_i = ancestor.i - self.start + if ancestor_i in range(length): + array[i, head_col] = ancestor_i - i + + # if there is no appropriate ancestor, define a new artificial root + value = array[i, head_col] + if (i+value) not in range(length): + new_root = old_to_new_root.get(ancestor_i, None) + if new_root is not None: + # take the same artificial root as a previous token from the same sentence + array[i, head_col] = new_root - i + else: + # set this token as the new artificial root + array[i, head_col] = 0 + old_to_new_root[ancestor_i] = i + + return array + def merge(self, *args, **attributes): """Retokenize the document, such that the span is merged into a single token. @@ -500,7 +540,7 @@ cdef class Span: if "root" in self.doc.user_span_hooks: return self.doc.user_span_hooks["root"](self) # This should probably be called 'head', and the other one called - # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ + # 'gov'. 
But we went with 'head' elsewhere, and now we're stuck =/ cdef int i # First, we scan through the Span, and check whether there's a word # with head==0, i.e. a sentence root. If so, we can return it. The From c8949ce88a8498d3b28466e39960a171e81954c3 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 24 Jul 2019 06:10:06 +0900 Subject: [PATCH 26/28] Remove old comment (#4012) Norwegian used to borrow from French but that doesn't appear to have been true for a while now, so the comment that was here is no longer relevant. --- spacy/lang/nb/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index fa0f31d33..c94ea3f39 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -12,10 +12,6 @@ from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups -# Borrowing french syntax parser because both languages use -# universal dependencies for tagging/parsing. -# Read here for more: -# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573 from .syntax_iterators import SYNTAX_ITERATORS From 73e095923f93b81fe40b80c384be644d2a829749 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 Jul 2019 11:27:34 +0200 Subject: [PATCH 27/28] =?UTF-8?q?=F0=9F=92=AB=20Improve=20error=20message?= =?UTF-8?q?=20when=20model.from=5Fbytes()=20dies=20(#4014)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improve error message when model.from_bytes() dies When Thinc's model.from_bytes() is called with a mismatched model, often we get a particularly ungraceful error, e.g. "AttributeError: FunctionLayer has no attribute G" This is because we're trying to load the parameters for something like a LayerNorm layer, and the model architecture has some other layer there instead. This is obviously terrible, especially since the error *type* is wrong. I've changed it to raise a ValueError. The error message is still probably a bit terse, but it's hard to be sure exactly what's gone wrong. * Update spacy/pipeline/pipes.pyx * Update spacy/pipeline/pipes.pyx * Update spacy/pipeline/pipes.pyx * Update spacy/syntax/nn_parser.pyx * Update spacy/syntax/nn_parser.pyx * Update spacy/pipeline/pipes.pyx Co-Authored-By: Matthew Honnibal * Update spacy/pipeline/pipes.pyx Co-Authored-By: Matthew Honnibal Co-authored-by: Ines Montani --- spacy/errors.py | 3 ++- spacy/pipeline/pipes.pyx | 27 +++++++++++++++++++++------ spacy/syntax/nn_parser.pyx | 10 ++++++++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 4af8b756c..1699809a7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -413,7 +413,8 @@ class Errors(object): "This is likely a bug in spaCy, so feel free to open an issue.") E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that each entity in `doc.ents` " "is assigned to a KB identifier.") - + E149 = ("Error deserializing model. 
Check that the config used to create the " + "component matches the model being loaded.") @add_codes class TempErrors(object): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 609c4e852..ca166607f 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -167,7 +167,10 @@ class Pipe(object): self.cfg["pretrained_vectors"] = self.vocab.vectors.name if self.model is True: self.model = self.Model(**self.cfg) - self.model.from_bytes(b) + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) deserialize = OrderedDict() deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) @@ -196,7 +199,10 @@ class Pipe(object): self.cfg["pretrained_vectors"] = self.vocab.vectors.name if self.model is True: self.model = self.Model(**self.cfg) - self.model.from_bytes(p.open("rb").read()) + try: + self.model.from_bytes(p.open("rb").read()) + except AttributeError: + raise ValueError(Errors.E149) deserialize = OrderedDict() deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) @@ -562,7 +568,10 @@ class Tagger(Pipe): "token_vector_width", self.cfg.get("token_vector_width", 96)) self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) - self.model.from_bytes(b) + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) def load_tag_map(b): tag_map = srsly.msgpack_loads(b) @@ -600,7 +609,10 @@ class Tagger(Pipe): if self.model is True: self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) with p.open("rb") as file_: - self.model.from_bytes(file_.read()) + try: + self.model.from_bytes(file_.read()) + except AttributeError: + raise ValueError(Errors.E149) def load_tag_map(p): tag_map = srsly.read_msgpack(p) @@ -1315,9 +1327,12 @@ class EntityLinker(Pipe): def from_disk(self, path, exclude=tuple(), **kwargs): def load_model(p): - if self.model is True: + if self.model is True: self.model = self.Model(**self.cfg) - self.model.from_bytes(p.open("rb").read()) + try: + self.model.from_bytes(p.open("rb").read()) + except AttributeError: + raise ValueError(Errors.E149) def load_kb(p): kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index a6a476901..fa1a41fa4 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -631,7 +631,10 @@ cdef class Parser: cfg = {} with (path / 'model').open('rb') as file_: bytes_data = file_.read() - self.model.from_bytes(bytes_data) + try: + self.model.from_bytes(bytes_data) + except AttributeError: + raise ValueError(Errors.E149) self.cfg.update(cfg) return self @@ -663,6 +666,9 @@ cdef class Parser: else: cfg = {} if 'model' in msg: - self.model.from_bytes(msg['model']) + try: + self.model.from_bytes(msg['model']) + except AttributeError: + raise ValueError(Errors.E149) self.cfg.update(cfg) return self From 784a5f4284ca6e115987187714cfc02796a3d523 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 25 Jul 2019 12:14:02 +0200 Subject: [PATCH 28/28] Update GoldParse attributes in API docs (#4023) * add `words` * update name of entity list to `ner` I think it might be a bit more consistent to have `ner` named `entities` or `ents` (and `ents` is actually set somewhere to `None`, which is a bit confusing), but it looks like renaming it would be a non-trivial decision. 
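For reference, here is a minimal sketch of how the documented attributes surface at runtime (assuming the spaCy v2.x `GoldParse` API; the sentence and entity labels are only illustrative):

    from spacy.lang.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp("Facebook released React in 2014")
    # the `entities` argument takes per-token BILUO tags (or offset triples)
    gold = GoldParse(doc, entities=["U-ORG", "O", "U-MISC", "O", "O"])
    print(gold.words)  # ["Facebook", "released", "React", "in", "2014"]
    print(gold.ner)    # ["U-ORG", "O", "U-MISC", "O", "O"]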
---
 website/docs/api/goldparse.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md
index 23575038c..9770f9537 100644
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@@ -45,10 +45,11 @@ Whether the provided syntactic annotations form a projective dependency tree.
 | Name | Type | Description |
 | -------------- | ---- | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| `words` | list | The words. |
 | `tags` | list | The part-of-speech tag annotations. |
 | `heads` | list | The syntactic head annotations. |
 | `labels` | list | The syntactic relation-type annotations. |
-| `ents` | list | The named entity annotations. |
+| `ner` | list | The named entity annotations as BILUO tags. |
 | `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. |
 | `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. |
 | `cats` 2 | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. |
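As a usage-level illustration of the `Span.as_doc` behaviour covered by the regression test added in PATCH 25/28 (a minimal sketch: the `en_core_web_sm` pipeline and the example sentence are assumptions, and any model with a dependency parser would do):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("He jests at scars, that never felt a wound.")
    span = doc[1:5]       # "jests at scars,"
    doc2 = span.as_doc()  # standalone copy of the span
    # Heads that pointed outside the span are re-attached to an ancestor inside
    # the span (or to a new artificial root) and given the generic "dep" label,
    # so the copied parse never indexes tokens that were not copied.
    print([(t.text, t.dep_, t.head.text) for t in doc2])
    print(len(list(doc2.sents)))  # the sentence count of the span is preserved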