From 61a33f55d2eec93a335dfecc9c9a5e85c339e00a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 10 Apr 2019 16:06:09 +0200 Subject: [PATCH 001/148] little fixes --- spacy/kb.pxd | 12 +++++++++--- spacy/kb.pyx | 13 +++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index e34a0a9ba..e57c162fc 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -13,7 +13,7 @@ from .typedefs cimport hash_t # of bits we need to keep track of the answers. cdef struct _EntryC: - # The hash of this entry's unique ID and name in the kB + # The hash of this entry's unique ID/name in the kB hash_t entity_hash # Allows retrieval of one or more vectors. @@ -99,7 +99,7 @@ cdef class KnowledgeBase: cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" - # This is what we'll map the hash key to. It's where the entry will sit + # This is what we'll map the entity hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t new_index = self._entries.size() self._entries.push_back( @@ -114,6 +114,8 @@ cdef class KnowledgeBase: cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" + # This is what we'll map the alias hash key to. It's where the alias will be defined + # in the vector of aliases. cdef int64_t new_index = self._aliases_table.size() self._aliases_table.push_back( @@ -126,12 +128,14 @@ cdef class KnowledgeBase: cdef inline _create_empty_vectors(self): """ - Making sure the first element of each vector is a dummy, + Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 self.vocab.strings.add("") + + self._entry_index = PreshMap() self._entries.push_back( _EntryC( entity_hash=self.vocab.strings[""], @@ -139,6 +143,8 @@ cdef class KnowledgeBase: feats_row=dummy_value, prob=dummy_value )) + + self._alias_index = PreshMap() self._aliases_table.push_back( _AliasC( entry_indices=[dummy_value], diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 3a0a8b918..38c393355 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,3 +1,4 @@ +# cython: infer_types=True # cython: profile=True # coding: utf8 from spacy.errors import Errors, Warnings, user_warning @@ -19,7 +20,7 @@ cdef class Candidate: @property def entity_(self): """RETURNS (unicode): ID/name of this entity in the KB""" - return self.kb.vocab.strings[self.entity] + return self.kb.vocab.strings[self.entity_hash] @property def alias(self): @@ -29,7 +30,7 @@ cdef class Candidate: @property def alias_(self): """RETURNS (unicode): ID of the original alias""" - return self.kb.vocab.strings[self.alias] + return self.kb.vocab.strings[self.alias_hash] @property def prior_prob(self): @@ -40,8 +41,6 @@ cdef class KnowledgeBase: def __init__(self, Vocab vocab): self.vocab = vocab - self._entry_index = PreshMap() - self._alias_index = PreshMap() self.mem = Pool() self._create_empty_vectors() @@ -56,8 +55,8 @@ cdef class KnowledgeBase: def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ - Add an entity to the KB. 
- Return the hash of the entity ID at the end + Add an entity to the KB, optionally specifying its log probability based on corpus frequency + Return the hash of the entity ID/name at the end """ cdef hash_t entity_hash = self.vocab.strings.add(entity) @@ -98,8 +97,6 @@ cdef class KnowledgeBase: user_warning(Warnings.W017.format(alias=alias)) return - cdef hash_t entity_hash - cdef vector[int64_t] entry_indices cdef vector[float] probs From 9a7d534b1bc07898d855a254fcbaa39b28023fa3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 10 Apr 2019 17:25:10 +0200 Subject: [PATCH 002/148] enable nogil for cython functions in kb.pxd --- examples/pipeline/dummy_entity_linking.py | 2 +- spacy/kb.pxd | 74 ++++++++++--------- spacy/kb.pyx | 15 +++- .../{test_el.py => test_entity_linker.py} | 0 4 files changed, 52 insertions(+), 39 deletions(-) rename spacy/tests/pipeline/{test_el.py => test_entity_linker.py} (100%) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 88415d040..e93e3e20b 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -28,7 +28,7 @@ def create_kb(vocab): print() alias_0 = "Douglas" print("adding alias", alias_0) - kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2]) + kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) alias_1 = "Douglas Adams" print("adding alias", alias_1) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index e57c162fc..3cdf1e07e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -97,58 +97,64 @@ cdef class KnowledgeBase: cdef object _features_table cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, - int32_t* vector_rows, int feats_row): - """Add an entry to the knowledge base.""" + int32_t* vector_rows, int feats_row) nogil: + """Add an entry to the vector of entries. + After calling this method, make sure to update also the _entry_index using the return value""" # This is what we'll map the entity hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t new_index = self._entries.size() - self._entries.push_back( - _EntryC( - entity_hash=entity_hash, - vector_rows=vector_rows, - feats_row=feats_row, - prob=prob - )) - self._entry_index[entity_hash] = new_index + + # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 + cdef _EntryC entry + entry.entity_hash = entity_hash + entry.vector_rows = vector_rows + entry.feats_row = feats_row + entry.prob = prob + + self._entries.push_back(entry) return new_index - cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): - """Connect a mention to a list of potential entities with their prior probabilities .""" + cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil: + """Connect a mention to a list of potential entities with their prior probabilities . + After calling this method, make sure to update also the _alias_index using the return value""" # This is what we'll map the alias hash key to. It's where the alias will be defined # in the vector of aliases. 
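        # A minimal sketch of the expected caller pattern, mirroring what KnowledgeBase.add_alias
        # in kb.pyx does with the return value:
        #     new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
        #     self._alias_index[alias_hash] = new_index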
cdef int64_t new_index = self._aliases_table.size() - self._aliases_table.push_back( - _AliasC( - entry_indices=entry_indices, - probs=probs - )) - self._alias_index[alias_hash] = new_index + # Avoid struct initializer to enable nogil + cdef _AliasC alias + alias.entry_indices = entry_indices + alias.probs = probs + + self._aliases_table.push_back(alias) return new_index - cdef inline _create_empty_vectors(self): + cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 - self.vocab.strings.add("") - self._entry_index = PreshMap() - self._entries.push_back( - _EntryC( - entity_hash=self.vocab.strings[""], - vector_rows=&dummy_value, - feats_row=dummy_value, - prob=dummy_value - )) + # Avoid struct initializer to enable nogil + cdef _EntryC entry + entry.entity_hash = dummy_hash + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value + entry.prob = dummy_value - self._alias_index = PreshMap() - self._aliases_table.push_back( - _AliasC( - entry_indices=[dummy_value], - probs=[dummy_value] - )) + # Avoid struct initializer to enable nogil + cdef vector[int64_t] dummy_entry_indices + dummy_entry_indices.push_back(0) + cdef vector[float] dummy_probs + dummy_probs.push_back(0) + + cdef _AliasC alias + alias.entry_indices = dummy_entry_indices + alias.probs = dummy_probs + + self._entries.push_back(entry) + self._aliases_table.push_back(alias) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 38c393355..97e86d01f 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -42,7 +42,11 @@ cdef class KnowledgeBase: def __init__(self, Vocab vocab): self.vocab = vocab self.mem = Pool() - self._create_empty_vectors() + self._entry_index = PreshMap() + self._alias_index = PreshMap() + + self.vocab.strings.add("") + self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) def __len__(self): return self.get_size_entities() @@ -66,8 +70,10 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_hash=entity_hash, prob=prob, - vector_rows=&dummy_value, feats_row=dummy_value) + new_index = self.c_add_entity(entity_hash=entity_hash, prob=prob, + vector_rows=&dummy_value, feats_row=dummy_value) + self._entry_index[entity_hash] = new_index + # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -109,7 +115,8 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) + new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) + self._alias_index[alias_hash] = new_index return alias_hash diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_entity_linker.py similarity index 100% rename from spacy/tests/pipeline/test_el.py rename to spacy/tests/pipeline/test_entity_linker.py From 6e997be4b4b364583c2e148992756992cd195b43 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Apr 2019 21:08:22 +0200 Subject: [PATCH 003/148] reading wikidata descriptions and aliases --- examples/pipeline/wikidata_entity_linking.py | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 examples/pipeline/wikidata_entity_linking.py diff --git 
a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py new file mode 100644 index 000000000..b467a5982 --- /dev/null +++ b/examples/pipeline/wikidata_entity_linking.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. +""" +import json +import spacy +import bz2 +from spacy.kb import KnowledgeBase + + +def create_kb(vocab): + kb = KnowledgeBase(vocab=vocab) + _read_wikidata() + + # adding entities + # kb.add_entity(entity=entity, prob=prob) + + # adding aliases + # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) + + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + + return kb + + +def _read_wikidata(): + """ Read the JSON wiki data """ + # TODO remove hardcoded path + + languages = {'en', 'de'} + + with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: + line = file.readline() + cnt = 1 + while line and cnt < 10: + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + obj = json.loads(clean_line) + unique_id = obj["id"] + print(unique_id) + + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + print("label (" + lang + "):", lang_label["value"]) + + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + print("description (" + lang + "):", lang_descr["value"]) + + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + print("alias (" + lang + "):", item["value"]) + + print() + line = file.readline() + cnt += 1 + + +def add_el(kb, nlp): + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is called Arthur Dent." 
+ doc = nlp(text) + + print() + for token in doc: + print("token", token.text, token.ent_type_, token.ent_kb_id_) + + print() + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +if __name__ == "__main__": + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab) + # add_el(my_kb, nlp) From b31a390a9aaedccbdc4dc4c7ce62197ef2e9e533 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Apr 2019 21:42:44 +0200 Subject: [PATCH 004/148] reading types, claims and sitelinks --- examples/pipeline/wikidata_entity_linking.py | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b467a5982..11e4cc04c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -30,6 +30,8 @@ def _read_wikidata(): # TODO remove hardcoded path languages = {'en', 'de'} + properties = {'P31'} + sites = {'enwiki'} with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: line = file.readline() @@ -40,8 +42,25 @@ def _read_wikidata(): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) + unique_id = obj["id"] - print(unique_id) + print("ID:", unique_id) + + entry_type = obj["type"] + print("type:", entry_type) + + # TODO: filter on rank: preferred, normal or deprecated + claims = obj["claims"] + for prop in properties: + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + print(prop, cp['mainsnak']['datavalue']['value']['id']) + + entry_sites = obj["sitelinks"] + for site in sites: + site_value = entry_sites.get(site, None) + print(site, ":", site_value['title']) labels = obj["labels"] if labels: From 3163331b1ee4238265e9584247fc36965fb9da13 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 14 Apr 2019 21:52:01 +0200 Subject: [PATCH 005/148] wikipedia dump parser and mediawiki format regex cleanup --- examples/pipeline/wikidata_entity_linking.py | 81 +++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 11e4cc04c..02106bc31 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals """Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. 
""" +import re import json import spacy import bz2 @@ -11,7 +12,8 @@ from spacy.kb import KnowledgeBase def create_kb(vocab): kb = KnowledgeBase(vocab=vocab) - _read_wikidata() + # _read_wikidata() + _read_wikipedia() # adding entities # kb.add_entity(entity=entity, prob=prob) @@ -89,6 +91,83 @@ def _read_wikidata(): cnt += 1 +def _read_wikipedia(): + """ Read the XML wikipedia data """ + # TODO remove hardcoded path + + # with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file: + with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + line = file.readline() + cnt = 1 + article_text = "" + article_title = None + article_id = None + reading_text = False + while line and cnt < 10000: + clean_line = line.strip().decode("utf-8") + + # Start reading new page + if clean_line == "": + article_text = "" + article_title = None + article_id = 342 + + # finished reading this page + elif clean_line == "": + if article_id: + _store_wp_article(article_id, article_title, article_text.strip()) + + # start reading text within a page + if ")\d*(?=)", clean_line) + if ids: + article_id = ids[0] + + # read the title of this article + titles = re.findall(r"(?<=).*(?=)", clean_line) + if titles: + article_title = titles[0].strip() + + line = file.readline() + cnt += 1 + + +def _store_wp_article(article_id, article_title, article_text): + print("WP article", article_id, ":", article_title) + print(article_text) + print(_get_clean_wp_text(article_text)) + print() + + +def _get_clean_wp_text(article_text): + # remove category statements + clean_text = re.sub('\[\[Category:.*\]\]', '', article_text) + + # remove nested {{info}} statements by removing the inner/smallest ones first and iterating + try_again = True + previous_length = len(clean_text) + while try_again: + clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match + print(clean_text) + if len(clean_text) < previous_length: + try_again = True + else: + try_again = False + previous_length = len(clean_text) + + return clean_text + + def add_el(kb, nlp): el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) nlp.add_pipe(el_pipe, last=True) From 6763e025e1f351b5c3f133d5a334217f172867b9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Apr 2019 11:41:57 +0200 Subject: [PATCH 006/148] parse wp dump for links to determine prior probabilities --- examples/pipeline/wikidata_entity_linking.py | 136 +++++++++++++++++-- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 02106bc31..5065648ef 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -6,9 +6,27 @@ from __future__ import unicode_literals import re import json import spacy +import datetime import bz2 from spacy.kb import KnowledgeBase +# these will/should be matched ignoring case +wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", + "d", "dbdump", "download", "Draft", "Education", "Foundation", + "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", + "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", + "MediaZilla", "Meta", "Metawikipedia", "Module", + "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", + "Portal", "phab", "Phabricator", "Project", "q", 
"quality", "rev", + "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", + "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", + "User", "User talk", "v", "voy", + "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", + "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", + "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] + +map_alias_to_link = dict() + def create_kb(vocab): kb = KnowledgeBase(vocab=vocab) @@ -38,7 +56,7 @@ def _read_wikidata(): with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: line = file.readline() cnt = 1 - while line and cnt < 10: + while line and cnt < 100000: clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] @@ -91,6 +109,78 @@ def _read_wikidata(): cnt += 1 +def _read_wikipedia_prior_probs(): + """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ + + # find the links + link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + + # match on interwiki links, e.g. `en:` or `:fr:` + ns_regex = r":?" + "[a-z][a-z]" + ":" + + # match on Namespace: optionally preceded by a : + for ns in wiki_namespaces: + ns_regex += "|" + ":?" + ns + ":" + + ns_regex = re.compile(ns_regex, re.IGNORECASE) + + # TODO remove hardcoded path + with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + line = file.readline() + cnt = 0 + while line: + if cnt % 5000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines") + clean_line = line.strip().decode("utf-8") + + matches = link_regex.findall(clean_line) + for match in matches: + match = match[2:][:-2].replace("_", " ").strip() + + if ns_regex.match(match): + pass # ignore namespaces at the beginning of the string + + # this is a simple link, with the alias the same as the mention + elif "|" not in match: + _store_alias(match, match) + + # in wiki format, the link is written as [[entity|alias]] + else: + splits = match.split("|") + entity = splits[0].strip() + alias = splits[1].strip() + # specific wiki format [[alias (specification)|]] + if len(alias) == 0 and "(" in entity: + alias = entity.split("(")[0] + _store_alias(alias, entity) + else: + _store_alias(alias, entity) + + line = file.readline() + cnt += 1 + + # only print aliases with more than one potential entity + # TODO remove hardcoded path + with open('C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv', mode='w', encoding='utf8') as outputfile: + outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") + for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): + for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + + +def _store_alias(alias, entity): + alias = alias.strip() + entity = entity.strip() + + # remove everything after # as this is not part of the title but refers to a specific paragraph + clean_entity = entity.split("#")[0].capitalize() + + if len(alias) > 0 and len(clean_entity) > 0: + alias_dict = map_alias_to_link.get(alias, dict()) + entity_count = alias_dict.get(clean_entity, 0) + alias_dict[clean_entity] = entity_count + 1 + map_alias_to_link[alias] = alias_dict + def _read_wikipedia(): """ Read the XML wikipedia data """ # TODO remove hardcoded path @@ -103,7 +193,7 @@ 
def _read_wikipedia(): article_title = None article_id = None reading_text = False - while line and cnt < 10000: + while line and cnt < 1000000: clean_line = line.strip().decode("utf-8") # Start reading new page @@ -143,28 +233,51 @@ def _read_wikipedia(): def _store_wp_article(article_id, article_title, article_text): + pass print("WP article", article_id, ":", article_title) print(article_text) print(_get_clean_wp_text(article_text)) print() + def _get_clean_wp_text(article_text): - # remove category statements - clean_text = re.sub('\[\[Category:.*\]\]', '', article_text) + # TODO: compile the regular expressions + + # remove Category and File statements + clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text) + print("1", clean_text) + clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) # TODO: this doesn't work yet + print("2", clean_text) + + # remove bolding markup + clean_text = re.sub('\'\'\'', '', clean_text) + clean_text = re.sub('\'\'', '', clean_text) # remove nested {{info}} statements by removing the inner/smallest ones first and iterating try_again = True previous_length = len(clean_text) while try_again: - clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match - print(clean_text) + clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match excluding a nested { if len(clean_text) < previous_length: try_again = True else: try_again = False previous_length = len(clean_text) + # remove multiple spaces + while ' ' in clean_text: + clean_text = re.sub(' ', ' ', clean_text) + + # remove simple interwiki links (no alternative name) + clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text) + + # remove simple interwiki links by picking the alternative name + clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text) + + # remove HTML comments + clean_text = re.sub('<!--[^!]*-->', '', clean_text) + return clean_text @@ -187,6 +300,13 @@ def add_el(kb, nlp): if __name__ == "__main__": - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab) + _read_wikipedia_prior_probs() + + # nlp = spacy.load('en_core_web_sm') + # my_kb = create_kb(nlp.vocab) # add_el(my_kb, nlp) + + # clean_text = "[[File:smomething]] jhk" + # clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', clean_text) + # clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) + # print(clean_text) From 10ee8dfea240ffe7e2b4d644df12b5179b6f01b6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 14:12:17 +0200 Subject: [PATCH 007/148] poc with few entities and collecting aliases from the WP links --- examples/pipeline/wikidata_entity_linking.py | 106 +++++++++++++++---- spacy/kb.pyx | 8 +- 2 files changed, 92 insertions(+), 22 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 5065648ef..cd6cc7c40 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -10,6 +10,13 @@ import datetime import bz2 from spacy.kb import KnowledgeBase +# TODO: remove hardcoded paths +WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' +ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' +ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' +PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' + + # these will/should be matched ignoring case wiki_namespaces = ["b", "betawikiversity", "Book", 
"c", "Category", "Commons", "d", "dbdump", "download", "Draft", "Education", "Foundation", @@ -28,16 +35,14 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", map_alias_to_link = dict() -def create_kb(vocab): +def create_kb(vocab, max_entities_per_alias, min_occ): kb = KnowledgeBase(vocab=vocab) + + _add_entities(kb) + _add_aliases(kb, max_entities_per_alias, min_occ) + # _read_wikidata() - _read_wikipedia() - - # adding entities - # kb.add_entity(entity=entity, prob=prob) - - # adding aliases - # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) + # _read_wikipedia() print() print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) @@ -45,6 +50,66 @@ def create_kb(vocab): return kb +def _add_entities(kb): + + kb.add_entity(entity="Earthquake", prob=0.342) + kb.add_entity(entity="2010 haiti earthquake", prob=0.1) + kb.add_entity(entity="1906 san francisco earthquake", prob=0.1) + kb.add_entity(entity="2011 christchurch earthquak", prob=0.1) + + kb.add_entity(entity="Soft drink", prob=0.342) + + print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) + + +def _add_aliases(kb, max_entities_per_alias, min_occ): + all_entities = kb.get_entity_strings() + # adding aliases with prior probabilities + with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + # we can read this file sequentially, it's sorted by alias, and then by count + previous_alias = None + total_count = 0 + counts = list() + entities = list() + while line: + splits = line.replace('\n', "").split(sep='|') + new_alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + if new_alias != previous_alias and previous_alias: + # done reading the previous alias --> output + if len(entities) > 0: + selected_entities = list() + prior_probs = list() + for ent_count, ent_string in zip(counts, entities): + if ent_string in all_entities: + p_entity_givenalias = ent_count / total_count + selected_entities.append(ent_string) + prior_probs.append(p_entity_givenalias) + + if selected_entities: + kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + total_count = 0 + counts = list() + entities = list() + + total_count += count + + if len(entities) < max_entities_per_alias and count >= min_occ: + counts.append(count) + entities.append(entity) + previous_alias = new_alias + + line = prior_file.readline() + + print() + print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) + + def _read_wikidata(): """ Read the JSON wiki data """ # TODO remove hardcoded path @@ -53,7 +118,7 @@ def _read_wikidata(): properties = {'P31'} sites = {'enwiki'} - with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: + with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() cnt = 1 while line and cnt < 100000: @@ -124,8 +189,7 @@ def _read_wikipedia_prior_probs(): ns_regex = re.compile(ns_regex, re.IGNORECASE) - # TODO remove hardcoded path - with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 0 while line: @@ -159,9 +223,8 @@ def _read_wikipedia_prior_probs(): line = file.readline() cnt += 1 - # only print aliases with more than one potential entity - # TODO remove hardcoded path - with 
open('C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv', mode='w', encoding='utf8') as outputfile: + # write all aliases and their entities and occurrences to file + with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): @@ -181,12 +244,11 @@ def _store_alias(alias, entity): alias_dict[clean_entity] = entity_count + 1 map_alias_to_link[alias] = alias_dict + def _read_wikipedia(): """ Read the XML wikipedia data """ - # TODO remove hardcoded path - # with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file: - with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 1 article_text = "" @@ -240,7 +302,6 @@ def _store_wp_article(article_id, article_title, article_text): print() - def _get_clean_wp_text(article_text): # TODO: compile the regular expressions @@ -300,10 +361,13 @@ def add_el(kb, nlp): if __name__ == "__main__": - _read_wikipedia_prior_probs() + # STEP 1 : create prior probabilities from WP + # run only once ! + # _read_wikipedia_prior_probs() - # nlp = spacy.load('en_core_web_sm') - # my_kb = create_kb(nlp.vocab) + # STEP 2 : create KB + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) # add_el(my_kb, nlp) # clean_text = "[[File:smomething]] jhk" diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 97e86d01f..8a1710a9c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -54,8 +54,14 @@ cdef class KnowledgeBase: def get_size_entities(self): return self._entries.size() - 1 # not counting dummy element on index 0 + def get_entity_strings(self): + return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 + def get_size_aliases(self): - return self._aliases_table.size() - 1 # not counting dummy element on index 0 + return self._aliases_table.size() - 1 # not counting dummy element on index + + def get_alias_strings(self): + return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ From 9f308eb5dc8fab4dc3a625480abf567f6841d144 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 16:14:25 +0200 Subject: [PATCH 008/148] fixes for prior prob and linking wikidata IDs with wikipedia titles --- examples/pipeline/wikidata_entity_linking.py | 164 ++++++++++++------- 1 file changed, 102 insertions(+), 62 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index cd6cc7c40..b7dba1e0d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -38,10 +38,13 @@ map_alias_to_link = dict() def create_kb(vocab, max_entities_per_alias, min_occ): kb = KnowledgeBase(vocab=vocab) - _add_entities(kb) - _add_aliases(kb, max_entities_per_alias, min_occ) + id_to_title = _read_wikidata(limit=100, to_print=False) + title_to_id = {v:k for k,v in id_to_title.items()} - # _read_wikidata() + _add_entities(kb, entities=id_to_title.keys(), probs=[0.4 for x in id_to_title.keys()]) + _add_aliases(kb, 
title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) + + # TODO: read wikipedia texts for entity context # _read_wikipedia() print() @@ -50,20 +53,17 @@ def create_kb(vocab, max_entities_per_alias, min_occ): return kb -def _add_entities(kb): - - kb.add_entity(entity="Earthquake", prob=0.342) - kb.add_entity(entity="2010 haiti earthquake", prob=0.1) - kb.add_entity(entity="1906 san francisco earthquake", prob=0.1) - kb.add_entity(entity="2011 christchurch earthquak", prob=0.1) - - kb.add_entity(entity="Soft drink", prob=0.342) +def _add_entities(kb, entities, probs): + for entity, prob in zip(entities, probs): + kb.add_entity(entity=entity, prob=prob) print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) -def _add_aliases(kb, max_entities_per_alias, min_occ): - all_entities = kb.get_entity_strings() +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): + wp_titles = title_to_id.keys() + print("wp titles", wp_titles) + # adding aliases with prior probabilities with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: # skip header @@ -86,13 +86,17 @@ def _add_aliases(kb, max_entities_per_alias, min_occ): selected_entities = list() prior_probs = list() for ent_count, ent_string in zip(counts, entities): - if ent_string in all_entities: + if ent_string in wp_titles: + wd_id = title_to_id[ent_string] p_entity_givenalias = ent_count / total_count - selected_entities.append(ent_string) + selected_entities.append(wd_id) prior_probs.append(p_entity_givenalias) if selected_entities: kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + print("analysed", previous_alias, "with entities", entities, "and counts", counts) + print("added", previous_alias, "with selected entities", selected_entities, "and probs", prior_probs) + print() total_count = 0 counts = list() entities = list() @@ -110,69 +114,94 @@ def _add_aliases(kb, max_entities_per_alias, min_occ): print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) -def _read_wikidata(): +def _read_wikidata(limit=None, to_print=False): """ Read the JSON wiki data """ - # TODO remove hardcoded path languages = {'en', 'de'} - properties = {'P31'} + prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected sites = {'enwiki'} + entity_dict = dict() + with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() cnt = 1 - while line and cnt < 100000: + while line and (not limit or cnt < limit): clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) + keep = False - unique_id = obj["id"] - print("ID:", unique_id) - - entry_type = obj["type"] - print("type:", entry_type) - + # filtering records on their properties # TODO: filter on rank: preferred, normal or deprecated claims = obj["claims"] - for prop in properties: + for prop, value_set in prop_filter.items(): claim_property = claims.get(prop, None) if claim_property: for cp in claim_property: - print(prop, cp['mainsnak']['datavalue']['value']['id']) + cp_id = cp['mainsnak']['datavalue']['value']['id'] + if cp_id in value_set: + keep = True - entry_sites = obj["sitelinks"] - for site in sites: - site_value = entry_sites.get(site, None) - print(site, ":", site_value['title']) + if keep: + unique_id = obj["id"] + entry_type = obj["type"] - labels = obj["labels"] - if labels: - for lang in languages: - lang_label = labels.get(lang, None) - 
if lang_label: - print("label (" + lang + "):", lang_label["value"]) + if to_print: + print("ID:", unique_id) + print("type:", entry_type) - descriptions = obj["descriptions"] - if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - print("description (" + lang + "):", lang_descr["value"]) + # parsing all properties that refer to other entities + for prop, claim_property in claims.items(): + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + if cp_values: + if to_print: + print("prop:", prop, cp_values) - aliases = obj["aliases"] - if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - print("alias (" + lang + "):", item["value"]) + entry_sites = obj["sitelinks"] + for site in sites: + site_value = entry_sites.get(site, None) + if site_value: + if to_print: + print(site, ":", site_value['title']) + if site == "enwiki": + entity_dict[unique_id] = site_value['title'] - print() + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) + + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) + + if to_print: + print() line = file.readline() cnt += 1 + return entity_dict + def _read_wikipedia_prior_probs(): """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ @@ -206,7 +235,7 @@ def _read_wikipedia_prior_probs(): # this is a simple link, with the alias the same as the mention elif "|" not in match: - _store_alias(match, match) + _store_alias(match, match, normalize_alias=True, normalize_entity=True) # in wiki format, the link is written as [[entity|alias]] else: @@ -216,9 +245,9 @@ def _read_wikipedia_prior_probs(): # specific wiki format [[alias (specification)|]] if len(alias) == 0 and "(" in entity: alias = entity.split("(")[0] - _store_alias(alias, entity) + _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) else: - _store_alias(alias, entity) + _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) line = file.readline() cnt += 1 @@ -231,17 +260,20 @@ def _read_wikipedia_prior_probs(): outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") -def _store_alias(alias, entity): +def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): alias = alias.strip() entity = entity.strip() # remove everything after # as this is not part of the title but refers to a specific paragraph - clean_entity = entity.split("#")[0].capitalize() + if normalize_entity: + entity = capitalize_first(entity.split("#")[0]) + if normalize_alias: + alias = capitalize_first(alias.split("#")[0]) - if len(alias) > 0 and len(clean_entity) > 0: + if alias and entity: alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(clean_entity, 0) - alias_dict[clean_entity] = entity_count + 1 
+ entity_count = alias_dict.get(entity, 0) + alias_dict[entity] = entity_count + 1 map_alias_to_link[alias] = alias_dict @@ -360,14 +392,22 @@ def add_el(kb, nlp): print("ent", ent.text, ent.label_, ent.kb_id_) +def capitalize_first(text): + if not text: + return None + result = text[0].capitalize() + if len(result) > 0: + result += text[1:] + return result + if __name__ == "__main__": # STEP 1 : create prior probabilities from WP # run only once ! - # _read_wikipedia_prior_probs() + _read_wikipedia_prior_probs() # STEP 2 : create KB - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) + # nlp = spacy.load('en_core_web_sm') + # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) # add_el(my_kb, nlp) # clean_text = "[[File:smomething]] jhk" From 9a8197185b733e471fa672e544fa2c8de57b991c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 22:37:50 +0200 Subject: [PATCH 009/148] fix alias capitalization --- examples/pipeline/wikidata_entity_linking.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b7dba1e0d..691be7990 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -204,7 +204,9 @@ def _read_wikidata(limit=None, to_print=False): def _read_wikipedia_prior_probs(): - """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ + """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines) + """ # find the links link_regex = re.compile(r'\[\[[^\[\]]*\]\]') @@ -266,9 +268,10 @@ def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): # remove everything after # as this is not part of the title but refers to a specific paragraph if normalize_entity: + # wikipedia titles are always capitalized entity = capitalize_first(entity.split("#")[0]) if normalize_alias: - alias = capitalize_first(alias.split("#")[0]) + alias = alias.split("#")[0] if alias and entity: alias_dict = map_alias_to_link.get(alias, dict()) From 004e5e7d1c76bd507e83aab6321177ce5d27f39b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Apr 2019 14:24:02 +0200 Subject: [PATCH 010/148] little fixes --- examples/pipeline/wikidata_entity_linking.py | 62 ++++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 691be7990..a02226f9f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -35,34 +35,46 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", map_alias_to_link = dict() -def create_kb(vocab, max_entities_per_alias, min_occ): +def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): kb = KnowledgeBase(vocab=vocab) - id_to_title = _read_wikidata(limit=100, to_print=False) + id_to_title = _read_wikidata(limit=1000) title_to_id = {v:k for k,v in id_to_title.items()} - _add_entities(kb, entities=id_to_title.keys(), probs=[0.4 for x in id_to_title.keys()]) - _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) + _add_entities(kb, + entities=id_to_title.keys(), + probs=[0.4 for x in id_to_title.keys()], + to_print=to_print) + 
+ _add_aliases(kb, + title_to_id=title_to_id, + max_entities_per_alias=max_entities_per_alias, + min_occ=min_occ, + to_print=to_print) # TODO: read wikipedia texts for entity context # _read_wikipedia() - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + if to_print: + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) return kb -def _add_entities(kb, entities, probs): +def _add_entities(kb, entities, probs, to_print=False): for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) - print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) + if to_print: + print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) -def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): wp_titles = title_to_id.keys() - print("wp titles", wp_titles) + + if to_print: + print("wp titles", wp_titles) # adding aliases with prior probabilities with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: @@ -94,9 +106,6 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): if selected_entities: kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) - print("analysed", previous_alias, "with entities", entities, "and counts", counts) - print("added", previous_alias, "with selected entities", selected_entities, "and probs", prior_probs) - print() total_count = 0 counts = list() entities = list() @@ -110,8 +119,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): line = prior_file.readline() - print() - print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) + if to_print: + print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) def _read_wikidata(limit=None, to_print=False): @@ -141,7 +150,7 @@ def _read_wikidata(limit=None, to_print=False): claim_property = claims.get(prop, None) if claim_property: for cp in claim_property: - cp_id = cp['mainsnak']['datavalue']['value']['id'] + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') if cp_id in value_set: keep = True @@ -383,7 +392,7 @@ def add_el(kb, nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is called Arthur Dent." + "The main character in Doug's novel is the man Arthur Dent, but Douglas doesn't write about George Washington." doc = nlp(text) print() @@ -406,14 +415,17 @@ def capitalize_first(text): if __name__ == "__main__": # STEP 1 : create prior probabilities from WP # run only once ! 
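    # STEP 1 writes the alias|count|entity frequencies to PRIOR_PROB; _add_aliases reads them back in STEP 2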
- _read_wikipedia_prior_probs() + # _read_wikipedia_prior_probs() # STEP 2 : create KB - # nlp = spacy.load('en_core_web_sm') - # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) - # add_el(my_kb, nlp) + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) - # clean_text = "[[File:smomething]] jhk" - # clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', clean_text) - # clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) - # print(clean_text) + # STEP 3 : write KB to file + # TODO + + # STEP 4 : read KB back in from file + # TODO + + # STEP 5 : actually use the EL functionality + add_el(my_kb, nlp) From 8e70a564f11f70b8e1d8acd7b2639562394d7455 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Apr 2019 16:33:40 +0200 Subject: [PATCH 011/148] custom reader and writer for _EntryC fields (first stab at it - not complete) --- examples/pipeline/wikidata_entity_linking.py | 16 ++- spacy/kb.pxd | 14 +++ spacy/kb.pyx | 106 +++++++++++++++++++ 3 files changed, 133 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a02226f9f..84e8066e2 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -16,6 +16,8 @@ ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-art ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' +KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' + # these will/should be matched ignoring case wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", @@ -418,14 +420,22 @@ if __name__ == "__main__": # _read_wikipedia_prior_probs() # STEP 2 : create KB - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) + # nlp = spacy.load('en_core_web_sm') + # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) # STEP 3 : write KB to file # TODO + nlp = spacy.load('en_core_web_sm') + kb = KnowledgeBase(vocab=nlp.vocab) + kb.dump(KB_FILE) + print("DUMPED") + kb.load(KB_FILE) + print("LOADED") + + # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' # STEP 4 : read KB back in from file # TODO # STEP 5 : actually use the EL functionality - add_el(my_kb, nlp) + # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 3cdf1e07e..eab947b66 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, int64_t from spacy.vocab cimport Vocab from .typedefs cimport hash_t +from libc.stdio cimport FILE + # Internal struct, for storage and disambiguation. This isn't what we return # to the user as the answer to "here's your entity". 
It's the minimum number @@ -158,3 +160,15 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) +cdef class Writer: + cdef FILE* _fp + + cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 + + +cdef class Reader: + cdef FILE* _fp + cdef public int32_t nr_feat + + cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 8a1710a9c..207231c99 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,8 +1,23 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 +from collections import OrderedDict +from cpython.exc cimport PyErr_CheckSignals + +from spacy import util from spacy.errors import Errors, Warnings, user_warning +from cpython.mem cimport PyMem_Malloc +from cpython.exc cimport PyErr_SetFromErrno + +from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek +from libc.stdint cimport int32_t, int64_t +from libc.stdlib cimport qsort + +from .typedefs cimport hash_t + +from os import path + cdef class Candidate: @@ -139,3 +154,94 @@ cdef class KnowledgeBase: prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] + + + def dump(self, loc): + # TODO: actually dump the data in this KB :-) + + cdef int64_t entry_id = 32 + self.vocab.strings.add("Q342") + cdef hash_t entity_hash = self.vocab.strings["Q342"] + cdef float prob = 0.333 + + cdef Writer writer = Writer(loc) + writer.write(entry_id, entity_hash, prob) + writer.close() + + def load(self, loc): + cdef int64_t entry_id + cdef hash_t entity_hash + cdef float prob + + cdef Reader reader = Reader(loc) + reader.read(self.mem, &entry_id, &entity_hash, &prob) + + cdef _EntryC entry + entry.entity_hash = entity_hash + entry.prob = prob + + # TODO + cdef int32_t dummy_value = 342 + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value + +cdef class Writer: + def __init__(self, object loc): + if path.exists(loc): + assert not path.isdir(loc), "%s is directory." 
% loc + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self._fp = fopen(bytes_loc, 'wb') + assert self._fp != NULL + fseek(self._fp, 0, 0) + + def close(self): + cdef size_t status = fclose(self._fp) + assert status == 0 + + cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: + cdef int i = 0 + + # TODO: feats_rows and vector rows + + _write(&entry_id, sizeof(entry_id), self._fp) + _write(&entry_hash, sizeof(entry_hash), self._fp) + _write(&entry_prob, sizeof(entry_prob), self._fp) + + +cdef int _write(void* value, size_t size, FILE* fp) except -1: + status = fwrite(value, size, 1, fp) + assert status == 1, status + + +cdef class Reader: + def __init__(self, object loc): + assert path.exists(loc) + assert not path.isdir(loc) + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self._fp = fopen(bytes_loc, 'rb') + if not self._fp: + PyErr_SetFromErrno(IOError) + status = fseek(self._fp, 0, 0) # this can be 0 if there is no header + + def __dealloc__(self): + fclose(self._fp) + + cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: + status = fread(entry_id, sizeof(entry_id), 1, self._fp) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entry ID from input file") + + #status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp) + status = fread(entity_hash, sizeof(entity_hash), 1, self._fp) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity hash from input file") + + status = fread(prob, sizeof(prob), 1, self._fp) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity prob from input file") From 694fea597aedd75ea9e045fd12268aba3ffd171d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Apr 2019 18:36:50 +0200 Subject: [PATCH 012/148] dumping all entryC entries + (inefficient) reading back in --- examples/pipeline/wikidata_entity_linking.py | 18 ++++-- spacy/kb.pxd | 1 - spacy/kb.pyx | 64 +++++++++++++------- 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 84e8066e2..db8d4577c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -426,16 +426,22 @@ if __name__ == "__main__": # STEP 3 : write KB to file # TODO nlp = spacy.load('en_core_web_sm') - kb = KnowledgeBase(vocab=nlp.vocab) - kb.dump(KB_FILE) - print("DUMPED") - kb.load(KB_FILE) - print("LOADED") + kb1 = KnowledgeBase(vocab=nlp.vocab) - # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' + kb1.add_entity(entity="Q53", prob=0.33) + kb1.add_entity(entity="Q17", prob=0.1) + kb1.add_entity(entity="Q007", prob=0.7) + kb1.add_entity(entity="Q44", prob=0.4) + print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) + + kb1.dump(KB_FILE) # STEP 4 : read KB back in from file # TODO + kb2 = KnowledgeBase(vocab=nlp.vocab) + kb2.load(KB_FILE) + print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) + # STEP 5 : actually use the EL functionality # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index eab947b66..c655c6bff 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -168,7 +168,6 @@ cdef class Writer: cdef class Reader: cdef FILE* _fp - cdef public int32_t nr_feat cdef int read(self, Pool mem, int64_t* entry_id, hash_t* 
entity_hash, float* prob) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 207231c99..4ec910b03 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -157,33 +157,45 @@ cdef class KnowledgeBase: def dump(self, loc): - # TODO: actually dump the data in this KB :-) - - cdef int64_t entry_id = 32 - self.vocab.strings.add("Q342") - cdef hash_t entity_hash = self.vocab.strings["Q342"] - cdef float prob = 0.333 - cdef Writer writer = Writer(loc) - writer.write(entry_id, entity_hash, prob) + + for key, entry_index in self._entry_index.items(): + entry = self._entries[entry_index] + print("dumping") + print("index", entry_index) + print("hash", entry.entity_hash) + print("prob", entry.prob) + print("") + writer.write(entry_index, entry.entity_hash, entry.prob) + writer.close() def load(self, loc): cdef int64_t entry_id cdef hash_t entity_hash cdef float prob + cdef _EntryC entry + cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) - reader.read(self.mem, &entry_id, &entity_hash, &prob) + result = reader.read(self.mem, &entry_id, &entity_hash, &prob) # -1: error, 0: eof after this one + while result: + print("loading") + print("entryID", entry_id) + print("hash", entity_hash) + print("prob", prob) + print("result:", result) + print("") + entry.entity_hash = entity_hash + entry.prob = prob - cdef _EntryC entry - entry.entity_hash = entity_hash - entry.prob = prob + # TODO features and vectors + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value - # TODO - cdef int32_t dummy_value = 342 - entry.vector_rows = &dummy_value - entry.feats_row = dummy_value + # TODO: use set instead of push_back to ensure the index remains the same? + self._entries.push_back(entry) + result = reader.read(self.mem, &entry_id, &entity_hash, &prob) cdef class Writer: def __init__(self, object loc): @@ -199,10 +211,7 @@ cdef class Writer: assert status == 0 cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: - cdef int i = 0 - # TODO: feats_rows and vector rows - _write(&entry_id, sizeof(entry_id), self._fp) _write(&entry_hash, sizeof(entry_hash), self._fp) _write(&entry_prob, sizeof(entry_prob), self._fp) @@ -227,21 +236,30 @@ cdef class Reader: fclose(self._fp) cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - status = fread(entry_id, sizeof(entry_id), 1, self._fp) + """ + Return values: + -1: error during current read (EOF during call) + 0: means we read the last line succesfully (EOF after call) + 1: we can continue reading this file """ + status = fread(entry_id, sizeof(int64_t), 1, self._fp) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entry ID from input file") - #status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp) - status = fread(entity_hash, sizeof(entity_hash), 1, self._fp) + status = fread(entity_hash, sizeof(hash_t), 1, self._fp) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity hash from input file") - status = fread(prob, sizeof(prob), 1, self._fp) + status = fread(prob, sizeof(float), 1, self._fp) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity prob from input file") + + if feof(self._fp): + return 0 + else: + return 1 From 6e3223f23494a8c3361290a748de39f5768438d4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 11:26:38 +0200 Subject: [PATCH 013/148] bulk loading in proper order of entity indices --- examples/pipeline/wikidata_entity_linking.py | 13 
++-- spacy/kb.pxd | 57 +++++------------ spacy/kb.pyx | 65 +++++++++++++------- spacy/structs.pxd | 37 +++++++++++ 4 files changed, 100 insertions(+), 72 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index db8d4577c..674c6166c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -424,9 +424,8 @@ if __name__ == "__main__": # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) # STEP 3 : write KB to file - # TODO - nlp = spacy.load('en_core_web_sm') - kb1 = KnowledgeBase(vocab=nlp.vocab) + nlp1 = spacy.load('en_core_web_sm') + kb1 = KnowledgeBase(vocab=nlp1.vocab) kb1.add_entity(entity="Q53", prob=0.33) kb1.add_entity(entity="Q17", prob=0.1) @@ -437,11 +436,11 @@ if __name__ == "__main__": kb1.dump(KB_FILE) # STEP 4 : read KB back in from file - # TODO - kb2 = KnowledgeBase(vocab=nlp.vocab) - kb2.load(KB_FILE) - print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) + nlp3 = spacy.load('en_core_web_sm') + kb3 = KnowledgeBase(vocab=nlp3.vocab) + kb3.load_bulk(7, KB_FILE) + print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) # STEP 5 : actually use the EL functionality # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index c655c6bff..817b7ff25 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -1,48 +1,17 @@ """Knowledge-base for entity or concept linking.""" from cymem.cymem cimport Pool from preshed.maps cimport PreshMap + from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t +from libc.stdio cimport FILE from spacy.vocab cimport Vocab from .typedefs cimport hash_t -from libc.stdio cimport FILE - - -# Internal struct, for storage and disambiguation. This isn't what we return -# to the user as the answer to "here's your entity". It's the minimum number -# of bits we need to keep track of the answers. -cdef struct _EntryC: - - # The hash of this entry's unique ID/name in the kB - hash_t entity_hash - - # Allows retrieval of one or more vectors. - # Each element of vector_rows should be an index into a vectors table. - # Every entry should have the same number of vectors, so we can avoid storing - # the number of vectors in each knowledge-base struct - int32_t* vector_rows - - # Allows retrieval of a struct of non-vector features. We could make this a - # pointer, but we have 32 bits left over in the struct after prob, so we'd - # like this to only be 32 bits. We can also set this to -1, for the common - # case where there are no features. - int32_t feats_row - - # log probability of entity, based on corpus frequency - float prob - - -# Each alias struct stores a list of Entry pointers with their prior probabilities -# for this specific mention/alias. -cdef struct _AliasC: - - # All entry candidates for this alias - vector[int64_t] entry_indices - - # Prior probability P(entity|alias) - should sum up to (at most) 1. - vector[float] probs +from .structs cimport EntryC, AliasC +ctypedef vector[EntryC] entry_vec +ctypedef vector[AliasC] alias_vec # Object used by the Entity Linker that summarizes one entity-alias candidate combination. @@ -68,7 +37,7 @@ cdef class KnowledgeBase: # over allocation. # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries. # Storing 1m entries would take 41.6mb under this scheme. 
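
# --- illustrative aside (plain Python, hypothetical names, not spaCy API) ---
# The hash -> index -> struct layout described above can be mimicked with a
# dict standing in for the PreshMap and a list standing in for the vector of
# entry structs; slot 0 stays reserved for a dummy entry because the map
# cannot store 0 as a value. A minimal sketch of that idea:

class ToyKB:
    def __init__(self):
        self._entry_index = {}           # entity_hash -> position in _entries
        self._entries = [None]           # position 0 is the reserved dummy slot

    def add_entity(self, entity_hash, prob):
        new_index = len(self._entries)   # where this entry will sit
        self._entries.append({"entity_hash": entity_hash, "prob": prob})
        self._entry_index[entity_hash] = new_index
        return new_index

    def get_entry(self, entity_hash):
        index = self._entry_index.get(entity_hash, 0)
        return self._entries[index]      # None if the hash is unknown
# ---------------------------------------------------------------------------
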
- cdef vector[_EntryC] _entries + cdef entry_vec _entries # This maps 64bit keys (hash of unique alias string) # to 64bit values (position of the _AliasC struct in the _aliases_table vector). @@ -78,7 +47,7 @@ cdef class KnowledgeBase: # should be P(entity | mention), which is pretty important to know. # We can pack both pieces of information into a 64-bit value, to keep things # efficient. - cdef vector[_AliasC] _aliases_table + cdef alias_vec _aliases_table # This is the part which might take more space: storing various # categorical features for the entries, and storing vectors for disambiguation @@ -98,6 +67,7 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row) nogil: """Add an entry to the vector of entries. @@ -107,7 +77,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._entries.size() # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 - cdef _EntryC entry + cdef EntryC entry entry.entity_hash = entity_hash entry.vector_rows = vector_rows entry.feats_row = feats_row @@ -124,7 +94,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._aliases_table.size() # Avoid struct initializer to enable nogil - cdef _AliasC alias + cdef AliasC alias alias.entry_indices = entry_indices alias.probs = probs @@ -140,7 +110,7 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 0 # Avoid struct initializer to enable nogil - cdef _EntryC entry + cdef EntryC entry entry.entity_hash = dummy_hash entry.vector_rows = &dummy_value entry.feats_row = dummy_value @@ -152,20 +122,21 @@ cdef class KnowledgeBase: cdef vector[float] dummy_probs dummy_probs.push_back(0) - cdef _AliasC alias + cdef AliasC alias alias.entry_indices = dummy_entry_indices alias.probs = dummy_probs self._entries.push_back(entry) self._aliases_table.push_back(alias) + cpdef load_bulk(self, int nr_entities, loc) + cdef class Writer: cdef FILE* _fp cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 - cdef class Reader: cdef FILE* _fp diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4ec910b03..c967654d3 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -7,6 +7,9 @@ from cpython.exc cimport PyErr_CheckSignals from spacy import util from spacy.errors import Errors, Warnings, user_warning +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap + from cpython.mem cimport PyMem_Malloc from cpython.exc cimport PyErr_SetFromErrno @@ -17,6 +20,8 @@ from libc.stdlib cimport qsort from .typedefs cimport hash_t from os import path +from libcpp.vector cimport vector + cdef class Candidate: @@ -53,7 +58,6 @@ cdef class Candidate: cdef class KnowledgeBase: - def __init__(self, Vocab vocab): self.vocab = vocab self.mem = Pool() @@ -67,13 +71,13 @@ cdef class KnowledgeBase: return self.get_size_entities() def get_size_entities(self): - return self._entries.size() - 1 # not counting dummy element on index 0 + return len(self._entry_index) def get_entity_strings(self): return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 def get_size_aliases(self): - return self._aliases_table.size() - 1 # not counting dummy element on index + return len(self._alias_index) def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 @@ -159,33 +163,44 @@ cdef class 
KnowledgeBase: def dump(self, loc): cdef Writer writer = Writer(loc) - for key, entry_index in self._entry_index.items(): + # dumping the entry records in the order in which they are in the _entries vector. + # index 0 is a dummy object not stored in the _entry_index and can be ignored. + i = 1 + for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): entry = self._entries[entry_index] print("dumping") print("index", entry_index) print("hash", entry.entity_hash) + assert entry.entity_hash == entry_hash + assert entry_index == i print("prob", entry.prob) print("") writer.write(entry_index, entry.entity_hash, entry.prob) + i = i+1 writer.close() - def load(self, loc): + cpdef load_bulk(self, int nr_entities, loc): + # TODO: nr_entities from header in file (Reader constructor) cdef int64_t entry_id cdef hash_t entity_hash cdef float prob - cdef _EntryC entry + cdef EntryC entry cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) - result = reader.read(self.mem, &entry_id, &entity_hash, &prob) # -1: error, 0: eof after this one - while result: - print("loading") - print("entryID", entry_id) - print("hash", entity_hash) - print("prob", prob) - print("result:", result) - print("") + to_read = self.get_size_entities() + + self._entry_index = PreshMap(nr_entities+1) + self._entries = entry_vec(nr_entities+1) + + # we assume the data was written in sequence + # index 0 is a dummy object not stored in the _entry_index and can be ignored. + # TODO: should we initialize the dummy objects ? + cdef int i = 1 + while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities: + assert i == entry_id + entry.entity_hash = entity_hash entry.prob = prob @@ -193,9 +208,18 @@ cdef class KnowledgeBase: entry.vector_rows = &dummy_value entry.feats_row = dummy_value - # TODO: use set instead of push_back to ensure the index remains the same? - self._entries.push_back(entry) - result = reader.read(self.mem, &entry_id, &entity_hash, &prob) + print("bulk loading") + print("i", i) + print("entryID", entry_id) + print("hash", entry.entity_hash) + print("prob", entry.prob) + print("") + + self._entries[i] = entry + self._entry_index[entity_hash] = i + + i += 1 + cdef class Writer: def __init__(self, object loc): @@ -236,11 +260,6 @@ cdef class Reader: fclose(self._fp) cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - """ - Return values: - -1: error during current read (EOF during call) - 0: means we read the last line succesfully (EOF after call) - 1: we can continue reading this file """ status = fread(entry_id, sizeof(int64_t), 1, self._fp) if status < 1: if feof(self._fp): @@ -263,3 +282,5 @@ cdef class Reader: return 0 else: return 1 + + diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 154202c0d..69a1f4961 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t +from libcpp.vector cimport vector +from libc.stdint cimport int32_t, int64_t + + cdef struct LexemeC: flags_t flags @@ -72,3 +76,36 @@ cdef struct TokenC: attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. attr_t ent_kb_id hash_t ent_id + + +# Internal struct, for storage and disambiguation of entities. +cdef struct EntryC: + + # The hash of this entry's unique ID/name in the kB + hash_t entity_hash + + # Allows retrieval of one or more vectors. 
+ # Each element of vector_rows should be an index into a vectors table. + # Every entry should have the same number of vectors, so we can avoid storing + # the number of vectors in each knowledge-base struct + int32_t* vector_rows + + # Allows retrieval of a struct of non-vector features. We could make this a + # pointer, but we have 32 bits left over in the struct after prob, so we'd + # like this to only be 32 bits. We can also set this to -1, for the common + # case where there are no features. + int32_t feats_row + + # log probability of entity, based on corpus frequency + float prob + + +# Each alias struct stores a list of Entry pointers with their prior probabilities +# for this specific mention/alias. +cdef struct AliasC: + + # All entry candidates for this alias + vector[int64_t] entry_indices + + # Prior probability P(entity|alias) - should sum up to (at most) 1. + vector[float] probs From ad6c5e581cd4a99300102e68cb6bdd463b51d380 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 15:31:44 +0200 Subject: [PATCH 014/148] writing and reading number of entries to/from header --- examples/pipeline/wikidata_entity_linking.py | 6 +- spacy/kb.pxd | 10 ++- spacy/kb.pyx | 65 ++++++++++---------- 3 files changed, 46 insertions(+), 35 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 674c6166c..8628c54a9 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -432,6 +432,7 @@ if __name__ == "__main__": kb1.add_entity(entity="Q007", prob=0.7) kb1.add_entity(entity="Q44", prob=0.4) print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) + print("dumping kb1") kb1.dump(KB_FILE) @@ -439,7 +440,10 @@ if __name__ == "__main__": nlp3 = spacy.load('en_core_web_sm') kb3 = KnowledgeBase(vocab=nlp3.vocab) - kb3.load_bulk(7, KB_FILE) + + kb3.load_bulk(KB_FILE) + + print("loading kb3") print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) # STEP 5 : actually use the EL functionality diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 817b7ff25..9c393e5f2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -129,16 +129,20 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef load_bulk(self, int nr_entities, loc) + cpdef load_bulk(self, loc) cdef class Writer: cdef FILE* _fp - cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 + cdef int write_header(self, int64_t nr_entries) except -1 + cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1 + cdef int _write(self, void* value, size_t size) except -1 cdef class Reader: cdef FILE* _fp - cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + cdef int read_header(self, int64_t* nr_entries) except -1 + cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + cdef int _read(self, void* value, size_t size) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index c967654d3..21c6d9049 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -64,6 +64,8 @@ cdef class KnowledgeBase: self._entry_index = PreshMap() self._alias_index = PreshMap() + # TODO initialize self._entries and self._aliases_table ? 
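
# --- illustrative aside (plain Python, made-up hash values) -----------------
# The dump()/load_bulk() changes below write the entries sorted by their
# position in the entries vector, so the loader can rebuild both the vector
# and the hash -> index map by simple enumeration, keeping index 0 free for
# the dummy element. A tiny sketch of why the index order round-trips:

entry_index = {"hash_a": 1, "hash_b": 2, "hash_c": 3}   # hypothetical hashes
dump_order = [h for h, i in sorted(entry_index.items(), key=lambda x: x[1])]
reloaded = {h: i for i, h in enumerate(dump_order, start=1)}
assert reloaded == entry_index
# ---------------------------------------------------------------------------
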
+ self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) @@ -162,26 +164,21 @@ cdef class KnowledgeBase: def dump(self, loc): cdef Writer writer = Writer(loc) + writer.write_header(self.get_size_entities()) # dumping the entry records in the order in which they are in the _entries vector. # index 0 is a dummy object not stored in the _entry_index and can be ignored. i = 1 for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): entry = self._entries[entry_index] - print("dumping") - print("index", entry_index) - print("hash", entry.entity_hash) assert entry.entity_hash == entry_hash assert entry_index == i - print("prob", entry.prob) - print("") - writer.write(entry_index, entry.entity_hash, entry.prob) + writer.write_entry(entry_index, entry.entity_hash, entry.prob) i = i+1 writer.close() - cpdef load_bulk(self, int nr_entities, loc): - # TODO: nr_entities from header in file (Reader constructor) + cpdef load_bulk(self, loc): cdef int64_t entry_id cdef hash_t entity_hash cdef float prob @@ -189,7 +186,8 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) - to_read = self.get_size_entities() + cdef int64_t nr_entities + reader.read_header(&nr_entities) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) @@ -198,23 +196,15 @@ cdef class KnowledgeBase: # index 0 is a dummy object not stored in the _entry_index and can be ignored. # TODO: should we initialize the dummy objects ? cdef int i = 1 - while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities: + while reader.read_entry(&entry_id, &entity_hash, &prob) and i <= nr_entities: assert i == entry_id + # TODO features and vectors entry.entity_hash = entity_hash entry.prob = prob - - # TODO features and vectors entry.vector_rows = &dummy_value entry.feats_row = dummy_value - print("bulk loading") - print("i", i) - print("entryID", entry_id) - print("hash", entry.entity_hash) - print("prob", entry.prob) - print("") - self._entries[i] = entry self._entry_index[entity_hash] = i @@ -234,16 +224,18 @@ cdef class Writer: cdef size_t status = fclose(self._fp) assert status == 0 - cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: + cdef int write_header(self, int64_t nr_entries) except -1: + self._write(&nr_entries, sizeof(nr_entries)) + + cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: # TODO: feats_rows and vector rows - _write(&entry_id, sizeof(entry_id), self._fp) - _write(&entry_hash, sizeof(entry_hash), self._fp) - _write(&entry_prob, sizeof(entry_prob), self._fp) + self._write(&entry_id, sizeof(entry_id)) + self._write(&entry_hash, sizeof(entry_hash)) + self._write(&entry_prob, sizeof(entry_prob)) - -cdef int _write(void* value, size_t size, FILE* fp) except -1: - status = fwrite(value, size, 1, fp) - assert status == 1, status + cdef int _write(self, void* value, size_t size) except -1: + status = fwrite(value, size, 1, self._fp) + assert status == 1, status cdef class Reader: @@ -259,20 +251,27 @@ cdef class Reader: def __dealloc__(self): fclose(self._fp) - cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - status = fread(entry_id, sizeof(int64_t), 1, self._fp) + cdef int read_header(self, int64_t* nr_entries) except -1: + status = self._read(nr_entries, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error 
reading header from input file") + + cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: + status = self._read(entry_id, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entry ID from input file") - status = fread(entity_hash, sizeof(hash_t), 1, self._fp) + status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity hash from input file") - status = fread(prob, sizeof(float), 1, self._fp) + status = self._read(prob, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file @@ -283,4 +282,8 @@ cdef class Reader: else: return 1 + cdef int _read(self, void* value, size_t size) except -1: + status = fread(value, size, 1, self._fp) + return status + From 3e0cb690653fa5fa6ebdc094d4cb65a4084578d0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 20:24:24 +0200 Subject: [PATCH 015/148] KB aliases to and from file --- examples/pipeline/wikidata_entity_linking.py | 23 +++- spacy/kb.pxd | 14 +- spacy/kb.pyx | 129 ++++++++++++++++--- 3 files changed, 141 insertions(+), 25 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 8628c54a9..a8a3eec1e 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -425,26 +425,37 @@ if __name__ == "__main__": # STEP 3 : write KB to file nlp1 = spacy.load('en_core_web_sm') - kb1 = KnowledgeBase(vocab=nlp1.vocab) + my_vocab = nlp1.vocab + kb1 = KnowledgeBase(vocab=my_vocab) kb1.add_entity(entity="Q53", prob=0.33) kb1.add_entity(entity="Q17", prob=0.1) kb1.add_entity(entity="Q007", prob=0.7) kb1.add_entity(entity="Q44", prob=0.4) - print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) - print("dumping kb1") + kb1.add_alias(alias="double07", entities=["Q007", "Q17"], probabilities=[0.9, 0.1]) + kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) + print("kb1 entities:", kb1.get_entity_strings()) + print("kb1 aliases:", kb1.get_alias_strings()) + + print() + print("dumping kb1") kb1.dump(KB_FILE) # STEP 4 : read KB back in from file nlp3 = spacy.load('en_core_web_sm') - kb3 = KnowledgeBase(vocab=nlp3.vocab) - - kb3.load_bulk(KB_FILE) + kb3 = KnowledgeBase(vocab=my_vocab) print("loading kb3") + kb3.load_bulk(KB_FILE) + + print() print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) + print("kb3 entities:", kb3.get_entity_strings()) + print("kb3 aliases:", kb3.get_alias_strings()) # STEP 5 : actually use the EL functionality # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 9c393e5f2..5f7bfa46c 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -136,13 +136,23 @@ cdef class Writer: cdef FILE* _fp cdef int write_header(self, int64_t nr_entries) except -1 - cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1 + cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1 + + cdef int write_alias_length(self, int64_t alias_length) except -1 + cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 + cdef int write_alias(self, int64_t entry_index, float prob) except -1 + cdef int _write(self, void* value, 
size_t size) except -1 cdef class Reader: cdef FILE* _fp cdef int read_header(self, int64_t* nr_entries) except -1 - cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1 + + cdef int read_alias_length(self, int64_t* alias_length) except -1 + cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 + cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 + cdef int _read(self, void* value, size_t size) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 21c6d9049..f3d5ecaa9 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -76,13 +76,13 @@ cdef class KnowledgeBase: return len(self._entry_index) def get_entity_strings(self): - return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 + return [self.vocab.strings[x] for x in self._entry_index] def get_size_aliases(self): return len(self._alias_index) def get_alias_strings(self): - return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 + return [self.vocab.strings[x] for x in self._alias_index] def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ @@ -173,31 +173,52 @@ cdef class KnowledgeBase: entry = self._entries[entry_index] assert entry.entity_hash == entry_hash assert entry_index == i - writer.write_entry(entry_index, entry.entity_hash, entry.prob) + writer.write_entry(entry.entity_hash, entry.prob) + i = i+1 + + writer.write_alias_length(self.get_size_aliases()) + + # dumping the aliases in the order in which they are in the _alias_index vector. + # index 0 is a dummy object not stored in the _aliases_table and can be ignored. + i = 1 + for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): + alias = self._aliases_table[alias_index] + assert alias_index == i + + candidate_length = len(alias.entry_indices) + writer.write_alias_header(alias_hash, candidate_length) + + for j in range(0, candidate_length): + writer.write_alias(alias.entry_indices[j], alias.probs[j]) + i = i+1 writer.close() cpdef load_bulk(self, loc): - cdef int64_t entry_id cdef hash_t entity_hash + cdef hash_t alias_hash + cdef int64_t entry_index cdef float prob cdef EntryC entry + cdef AliasC alias cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) + + # Step 1: load entities + cdef int64_t nr_entities reader.read_header(&nr_entities) - self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) - # we assume the data was written in sequence + # we assume that the entity data was written in sequence # index 0 is a dummy object not stored in the _entry_index and can be ignored. # TODO: should we initialize the dummy objects ? 
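
# --- illustrative aside: the on-disk layout, sketched with Python's struct --
# Assuming native byte order, int64 counts/indices, uint64 hashes and float32
# probabilities (as the sizeof() calls in Writer/Reader suggest), the file
# written by dump() and parsed here can be round-tripped in plain Python.
# toy_dump/toy_load are hypothetical helpers, not spaCy API.
import struct

def toy_dump(path, entries, aliases):
    # entries: [(entity_hash, prob)]; aliases: [(alias_hash, [(entry_index, prior), ...])]
    with open(path, "wb") as out:
        out.write(struct.pack("=q", len(entries)))                 # header: nr of entries
        for entity_hash, prob in entries:
            out.write(struct.pack("=Qf", entity_hash, prob))
        out.write(struct.pack("=q", len(aliases)))                 # nr of aliases
        for alias_hash, candidates in aliases:
            out.write(struct.pack("=Qq", alias_hash, len(candidates)))
            for entry_idx, prior in candidates:
                out.write(struct.pack("=qf", entry_idx, prior))

def toy_load(path):
    with open(path, "rb") as infile:
        def read(fmt):
            return struct.unpack(fmt, infile.read(struct.calcsize(fmt)))
        entries = [read("=Qf") for _ in range(read("=q")[0])]
        aliases = []
        for _ in range(read("=q")[0]):
            alias_hash, nr_candidates = read("=Qq")
            aliases.append((alias_hash, [read("=qf") for _ in range(nr_candidates)]))
    return entries, aliases
# ---------------------------------------------------------------------------
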
cdef int i = 1 - while reader.read_entry(&entry_id, &entity_hash, &prob) and i <= nr_entities: - assert i == entry_id + while i <= nr_entities: + reader.read_entry(&entity_hash, &prob) # TODO features and vectors entry.entity_hash = entity_hash @@ -210,6 +231,43 @@ cdef class KnowledgeBase: i += 1 + # check that all entities were read in properly + assert nr_entities == self.get_size_entities() + + # Step 2: load aliases + cdef int64_t nr_aliases + reader.read_alias_length(&nr_aliases) + self._alias_index = PreshMap(nr_aliases+1) + self._aliases_table = alias_vec(nr_aliases+1) + + cdef int64_t nr_candidates + cdef vector[int64_t] entry_indices + cdef vector[float] probs + + i = 1 + # we assume the alias data was written in sequence + # index 0 is a dummy object not stored in the _entry_index and can be ignored. + while i <= nr_aliases: + reader.read_alias_header(&alias_hash, &nr_candidates) + entry_indices = vector[int64_t](nr_candidates) + probs = vector[float](nr_candidates) + + for j in range(0, nr_candidates): + reader.read_alias(&entry_index, &prob) + entry_indices[j] = entry_index + probs[j] = prob + + alias.entry_indices = entry_indices + alias.probs = probs + + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + + i += 1 + + # check that all aliases were read in properly + assert nr_aliases == self.get_size_aliases() + cdef class Writer: def __init__(self, object loc): @@ -227,12 +285,22 @@ cdef class Writer: cdef int write_header(self, int64_t nr_entries) except -1: self._write(&nr_entries, sizeof(nr_entries)) - cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: + cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1: # TODO: feats_rows and vector rows - self._write(&entry_id, sizeof(entry_id)) self._write(&entry_hash, sizeof(entry_hash)) self._write(&entry_prob, sizeof(entry_prob)) + cdef int write_alias_length(self, int64_t alias_length) except -1: + self._write(&alias_length, sizeof(alias_length)) + + cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1: + self._write(&alias_hash, sizeof(alias_hash)) + self._write(&candidate_length, sizeof(candidate_length)) + + cdef int write_alias(self, int64_t entry_index, float prob) except -1: + self._write(&entry_index, sizeof(entry_index)) + self._write(&prob, sizeof(prob)) + cdef int _write(self, void* value, size_t size) except -1: status = fwrite(value, size, 1, self._fp) assert status == 1, status @@ -258,13 +326,7 @@ cdef class Reader: return 0 # end of file raise IOError("error reading header from input file") - cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - status = self._read(entry_id, sizeof(int64_t)) - if status < 1: - if feof(self._fp): - return 0 # end of file - raise IOError("error reading entry ID from input file") - + cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): @@ -282,6 +344,39 @@ cdef class Reader: else: return 1 + cdef int read_alias_length(self, int64_t* alias_length) except -1: + status = self._read(alias_length, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading alias length from input file") + + cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1: + status = self._read(alias_hash, sizeof(hash_t)) + if status < 1: + if feof(self._fp): + return 0 # end of 
file + raise IOError("error reading alias hash from input file") + + status = self._read(candidate_length, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading candidate length from input file") + + cdef int read_alias(self, int64_t* entry_index, float* prob) except -1: + status = self._read(entry_index, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entry index for alias from input file") + + status = self._read(prob, sizeof(float)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading prob for entity/alias from input file") + cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status From 54d0cea0626fd9977b15c87284e16ccb063e076f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 23:52:34 +0200 Subject: [PATCH 016/148] unit test for KB serialization --- examples/pipeline/wikidata_entity_linking.py | 2 +- spacy/kb.pxd | 1 + spacy/kb.pyx | 8 ++- spacy/tests/serialize/test_serialize_kb.py | 64 ++++++++++++++++++++ 4 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/serialize/test_serialize_kb.py diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a8a3eec1e..3b0943167 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -442,11 +442,11 @@ if __name__ == "__main__": print() print("dumping kb1") + print(KB_FILE, type(KB_FILE)) kb1.dump(KB_FILE) # STEP 4 : read KB back in from file - nlp3 = spacy.load('en_core_web_sm') kb3 = KnowledgeBase(vocab=my_vocab) print("loading kb3") diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 5f7bfa46c..82b06d192 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -19,6 +19,7 @@ cdef class Candidate: cdef readonly KnowledgeBase kb cdef hash_t entity_hash + cdef float entity_freq cdef hash_t alias_hash cdef float prior_prob diff --git a/spacy/kb.pyx b/spacy/kb.pyx index f3d5ecaa9..ad2e13b5e 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -26,9 +26,10 @@ from libcpp.vector cimport vector cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): + def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, alias_hash, prior_prob): self.kb = kb self.entity_hash = entity_hash + self.entity_freq = entity_freq self.alias_hash = alias_hash self.prior_prob = prior_prob @@ -52,6 +53,10 @@ cdef class Candidate: """RETURNS (unicode): ID of the original alias""" return self.kb.vocab.strings[self.alias_hash] + @property + def entity_freq(self): + return self.entity_freq + @property def prior_prob(self): return self.prior_prob @@ -156,6 +161,7 @@ cdef class KnowledgeBase: return [Candidate(kb=self, entity_hash=self._entries[entry_index].entity_hash, + entity_freq=self._entries[entry_index].prob, alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py new file mode 100644 index 000000000..ae0eedeeb --- /dev/null +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -0,0 +1,64 @@ +from ..util import make_tempdir +from ...util import ensure_path + +from spacy.kb import KnowledgeBase + + +def test_serialize_kb_disk(en_vocab): + kb1 = KnowledgeBase(vocab=en_vocab) + + kb1.add_entity(entity="Q53", prob=0.33) + 
kb1.add_entity(entity="Q17", prob=0.2) + kb1.add_entity(entity="Q007", prob=0.7) + kb1.add_entity(entity="Q44", prob=0.4) + kb1.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) + kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + + # baseline assertions + _check_kb(kb1) + + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + print(file_path, type(file_path)) + kb1.dump(str(file_path)) + + kb2 = KnowledgeBase(vocab=en_vocab) + kb2.load_bulk(str(file_path)) + + # final assertions + _check_kb(kb2) + + +def _check_kb(kb): + # check entities + assert kb.get_size_entities() == 4 + for entity_string in ["Q53", "Q17", "Q007", "Q44"]: + assert entity_string in kb.get_entity_strings() + for entity_string in ["", "Q0"]: + assert entity_string not in kb.get_entity_strings() + + # check aliases + assert kb.get_size_aliases() == 3 + for alias_string in ["double07", "guy", "random"]: + assert alias_string in kb.get_alias_strings() + for alias_string in ["nothingness", "", "randomnoise"]: + assert alias_string not in kb.get_alias_strings() + + # check candidates & probabilities + candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_) + assert len(candidates) == 2 + + assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_freq < 0.701 and candidates[0].entity_freq > 0.699 + assert candidates[0].alias_ == "double07" + assert candidates[0].prior_prob < 0.901 and candidates[0].prior_prob > 0.899 + + assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_freq < 0.201 and candidates[1].entity_freq > 0.199 + assert candidates[1].alias_ == "double07" + assert candidates[1].prior_prob < 0.101 and candidates[1].prior_prob > 0.099 From 387263d618369aaaffaa9561791c4dc3ce988dd7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 29 Apr 2019 13:58:07 +0200 Subject: [PATCH 017/148] simplify chains --- spacy/tests/serialize/test_serialize_kb.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index ae0eedeeb..3ff6eaef6 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -54,11 +54,11 @@ def _check_kb(kb): assert len(candidates) == 2 assert candidates[0].entity_ == "Q007" - assert candidates[0].entity_freq < 0.701 and candidates[0].entity_freq > 0.699 + assert 0.6999 < candidates[0].entity_freq < 0.701 assert candidates[0].alias_ == "double07" - assert candidates[0].prior_prob < 0.901 and candidates[0].prior_prob > 0.899 + assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_ == "Q17" - assert candidates[1].entity_freq < 0.201 and candidates[1].entity_freq > 0.199 + assert 0.199 < candidates[1].entity_freq < 0.201 assert candidates[1].alias_ == "double07" - assert candidates[1].prior_prob < 0.101 and candidates[1].prior_prob > 0.099 + assert 0.099 < candidates[1].prior_prob < 0.101 From 19e8f339cb3a125bbd7e5ae387e27dd417054dd7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 29 Apr 2019 17:37:29 +0200 Subject: [PATCH 018/148] deduce entity freq from WP corpus and serialize vocab in WP test --- examples/pipeline/wikidata_entity_linking.py | 258 +++++++++++-------- spacy/tests/serialize/test_serialize_kb.py | 27 +- 2 
files changed, 171 insertions(+), 114 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 3b0943167..2a544674f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. +from spacy.vocab import Vocab + +""" +Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """ import re import json @@ -17,6 +20,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' +VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' # these will/should be matched ignoring case @@ -40,12 +44,16 @@ map_alias_to_link = dict() def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): kb = KnowledgeBase(vocab=vocab) - id_to_title = _read_wikidata(limit=1000) - title_to_id = {v:k for k,v in id_to_title.items()} + id_to_title = _read_wikidata_entities(limit=None) + title_to_id = {v: k for k, v in id_to_title.items()} + + entity_list = list(id_to_title.keys()) + title_list = [id_to_title[x] for x in entity_list] + entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False) _add_entities(kb, - entities=id_to_title.keys(), - probs=[0.4 for x in id_to_title.keys()], + entities=entity_list, + probs=entity_frequencies, to_print=to_print) _add_aliases(kb, @@ -64,6 +72,38 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): return kb +def _get_entity_frequencies(entities, to_print=False): + count_entities = [0 for _ in entities] + total_count = 0 + + with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + # we can read this file sequentially, it's sorted by alias, and then by count + + while line: + splits = line.replace('\n', "").split(sep='|') + # alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + if entity in entities: + index = entities.index(entity) + count_entities[index] = count_entities[index] + count + + total_count += count + + line = prior_file.readline() + + if to_print: + for entity, count in zip(entities, count_entities): + print("Entity count:", entity, count) + print("Total count:", total_count) + + return [x*100 / total_count for x in count_entities] + + def _add_entities(kb, entities, probs, to_print=False): for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -76,7 +116,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals wp_titles = title_to_id.keys() if to_print: - print("wp titles", wp_titles) + print("wp titles:", wp_titles) # adding aliases with prior probabilities with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: @@ -125,89 +165,100 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) -def _read_wikidata(limit=None, to_print=False): - """ Read the JSON wiki data """ +def _read_wikidata_entities(limit=None, to_print=False): + """ Read the JSON wiki data and parse out the entities""" languages = {'en', 'de'} prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property 
suffices to be selected - sites = {'enwiki'} + site_filter = 'enwiki' entity_dict = dict() + # parse appropriate fields - depending on what we need in the KB + parse_properties = False + parse_sitelinks = True + parse_labels = False + parse_descriptions = False + parse_aliases = False + with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() - cnt = 1 + cnt = 0 while line and (not limit or cnt < limit): + if cnt % 100000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) - keep = False + unique_id = obj["id"] + entry_type = obj["type"] - # filtering records on their properties - # TODO: filter on rank: preferred, normal or deprecated - claims = obj["claims"] - for prop, value_set in prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - if cp_id in value_set: - keep = True + if unique_id[0] == 'Q' and entry_type == "item": + # filtering records on their properties + keep = False + claims = obj["claims"] + for prop, value_set in prop_filter.items(): + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + cp_rank = cp['rank'] + if cp_rank != "deprecated" and cp_id in value_set: + keep = True - if keep: - unique_id = obj["id"] - entry_type = obj["type"] + if keep: + if to_print: + print("ID:", unique_id) + print("type:", entry_type) - if to_print: - print("ID:", unique_id) - print("type:", entry_type) + # parsing all properties that refer to other entities + if parse_properties: + for prop, claim_property in claims.items(): + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + if cp_values: + if to_print: + print("prop:", prop, cp_values) - # parsing all properties that refer to other entities - for prop, claim_property in claims.items(): - cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] - cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - entry_sites = obj["sitelinks"] - for site in sites: - site_value = entry_sites.get(site, None) - if site_value: - if to_print: - print(site, ":", site_value['title']) - if site == "enwiki": + if parse_sitelinks: + site_value = obj["sitelinks"].get(site_filter, None) + if site_value: + if to_print: + print(site_filter, ":", site_value['title']) entity_dict[unique_id] = site_value['title'] - labels = obj["labels"] - if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) + if parse_labels: + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) - descriptions = obj["descriptions"] - if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + 
"):", lang_descr["value"]) + if parse_descriptions: + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) - aliases = obj["aliases"] - if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) + if parse_aliases: + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) - if to_print: - print() + if to_print: + print() line = file.readline() cnt += 1 @@ -236,7 +287,7 @@ def _read_wikipedia_prior_probs(): cnt = 0 while line: if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines") + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") matches = link_regex.findall(clean_line) @@ -394,7 +445,8 @@ def add_el(kb, nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, but Douglas doesn't write about George Washington." + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." doc = nlp(text) print() @@ -414,48 +466,46 @@ def capitalize_first(text): result += text[1:] return result + if __name__ == "__main__": + to_create_prior_probs = False + to_create_kb = True + to_read_kb = False + # STEP 1 : create prior probabilities from WP # run only once ! 
- # _read_wikipedia_prior_probs() + if to_create_prior_probs: + _read_wikipedia_prior_probs() - # STEP 2 : create KB - # nlp = spacy.load('en_core_web_sm') - # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) + if to_create_kb: + # STEP 2 : create KB + my_nlp = spacy.load('en_core_web_sm') + my_vocab = my_nlp.vocab + my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) - # STEP 3 : write KB to file - nlp1 = spacy.load('en_core_web_sm') - my_vocab = nlp1.vocab - kb1 = KnowledgeBase(vocab=my_vocab) + # STEP 3 : write KB to file + my_kb.dump(KB_FILE) + my_vocab.to_disk(VOCAB_DIR) - kb1.add_entity(entity="Q53", prob=0.33) - kb1.add_entity(entity="Q17", prob=0.1) - kb1.add_entity(entity="Q007", prob=0.7) - kb1.add_entity(entity="Q44", prob=0.4) - kb1.add_alias(alias="double07", entities=["Q007", "Q17"], probabilities=[0.9, 0.1]) - kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + if to_read_kb: + # STEP 4 : read KB back in from file + my_vocab = Vocab() + my_vocab.from_disk(VOCAB_DIR) + my_kb = KnowledgeBase(vocab=my_vocab) + my_kb.load_bulk(KB_FILE) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) - print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) - print("kb1 entities:", kb1.get_entity_strings()) - print("kb1 aliases:", kb1.get_alias_strings()) + # test KB + candidates = my_kb.get_candidates("Bush") + for c in candidates: + print() + print("entity:", c.entity_) + print("entity freq:", c.entity_freq) + print("alias:", c.alias_) + print("prior prob:", c.prior_prob) - print() - print("dumping kb1") - print(KB_FILE, type(KB_FILE)) - kb1.dump(KB_FILE) - - # STEP 4 : read KB back in from file - - kb3 = KnowledgeBase(vocab=my_vocab) - - print("loading kb3") - kb3.load_bulk(KB_FILE) - - print() - print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) - print("kb3 entities:", kb3.get_entity_strings()) - print("kb3 aliases:", kb3.get_alias_strings()) - - # STEP 5 : actually use the EL functionality + # STEP 5: add KB to NLP pipeline # add_el(my_kb, nlp) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 3ff6eaef6..7b1380623 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,3 +1,5 @@ +import spacy +from spacy.lang.en import English from ..util import make_tempdir from ...util import ensure_path @@ -5,17 +7,8 @@ from spacy.kb import KnowledgeBase def test_serialize_kb_disk(en_vocab): - kb1 = KnowledgeBase(vocab=en_vocab) - - kb1.add_entity(entity="Q53", prob=0.33) - kb1.add_entity(entity="Q17", prob=0.2) - kb1.add_entity(entity="Q007", prob=0.7) - kb1.add_entity(entity="Q44", prob=0.4) - kb1.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) - kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) - # baseline assertions + kb1 = _get_dummy_kb(en_vocab) _check_kb(kb1) # dumping to file & loading back in @@ -34,6 +27,20 @@ def test_serialize_kb_disk(en_vocab): _check_kb(kb2) +def _get_dummy_kb(vocab): + kb = KnowledgeBase(vocab=vocab) + + 
kb.add_entity(entity="Q53", prob=0.33) + kb.add_entity(entity="Q17", prob=0.2) + kb.add_entity(entity="Q007", prob=0.7) + kb.add_entity(entity="Q44", prob=0.4) + kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) + kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + + return kb + + def _check_kb(kb): # check entities assert kb.get_size_entities() == 4 From 653b7d9c87e62c8e37c96f0bac76f5c18ca4889c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 30 Apr 2019 11:39:42 +0200 Subject: [PATCH 019/148] calculate entity raw counts offline to speed up KB construction --- examples/pipeline/wikidata_entity_linking.py | 91 ++++++++++++++------ 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2a544674f..43ba7d8d3 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -1,23 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals -from spacy.vocab import Vocab - """ Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """ import re +import csv import json import spacy import datetime import bz2 from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab # TODO: remove hardcoded paths WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' + PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' +ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' @@ -44,18 +46,30 @@ map_alias_to_link = dict() def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): kb = KnowledgeBase(vocab=vocab) - id_to_title = _read_wikidata_entities(limit=None) - title_to_id = {v: k for k, v in id_to_title.items()} + print() + print("1. _read_wikidata_entities", datetime.datetime.now()) + print() + title_to_id = _read_wikidata_entities(limit=100000) - entity_list = list(id_to_title.keys()) - title_list = [id_to_title[x] for x in entity_list] - entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False) + title_list = list(title_to_id.keys()) + entity_list = [title_to_id[x] for x in title_list] + print() + print("2. _get_entity_frequencies", datetime.datetime.now()) + print() + entity_frequencies = _get_entity_frequencies(entities=title_list) + + print() + print("3. _add_entities", datetime.datetime.now()) + print() _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) + print() + print("4. 
_add_aliases", datetime.datetime.now()) + print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, @@ -72,15 +86,26 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): return kb -def _get_entity_frequencies(entities, to_print=False): - count_entities = [0 for _ in entities] +def _get_entity_frequencies(entities): + entity_to_count = dict() + with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_count[row[0]] = int(row[1]) + + return [entity_to_count.get(e, 0) for e in entities] + + +def _write_entity_counts(to_print=False): + entity_to_count = dict() total_count = 0 with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count while line: splits = line.replace('\n', "").split(sep='|') @@ -88,23 +113,26 @@ def _get_entity_frequencies(entities, to_print=False): count = int(splits[1]) entity = splits[2] - if entity in entities: - index = entities.index(entity) - count_entities[index] = count_entities[index] + count + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count total_count += count line = prior_file.readline() + with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + if to_print: - for entity, count in zip(entities, count_entities): + for entity, count in entity_to_count.items(): print("Entity count:", entity, count) print("Total count:", total_count) - return [x*100 / total_count for x in count_entities] - def _add_entities(kb, entities, probs, to_print=False): + # TODO: this should be a bulk method for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -166,13 +194,13 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals def _read_wikidata_entities(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities""" + """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" languages = {'en', 'de'} prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' - entity_dict = dict() + title_to_id = dict() # parse appropriate fields - depending on what we need in the KB parse_properties = False @@ -192,12 +220,12 @@ def _read_wikidata_entities(limit=None, to_print=False): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) - unique_id = obj["id"] entry_type = obj["type"] - if unique_id[0] == 'Q' and entry_type == "item": + if entry_type == "item": # filtering records on their properties keep = False + claims = obj["claims"] for prop, value_set in prop_filter.items(): claim_property = claims.get(prop, None) @@ -209,6 +237,8 @@ def _read_wikidata_entities(limit=None, to_print=False): keep = True if keep: + unique_id = obj["id"] + if to_print: print("ID:", unique_id) print("type:", entry_type) @@ -225,9 +255,10 @@ def _read_wikidata_entities(limit=None, to_print=False): if parse_sitelinks: site_value = obj["sitelinks"].get(site_filter, None) if site_value: + site = site_value['title'] if to_print: - print(site_filter, ":", site_value['title']) - entity_dict[unique_id] = site_value['title'] + print(site_filter, ":", site) + title_to_id[site] = unique_id if parse_labels: labels = obj["labels"] @@ -262,7 +293,7 @@ def _read_wikidata_entities(limit=None, to_print=False): line = file.readline() cnt += 1 - return entity_dict + return title_to_id def _read_wikipedia_prior_probs(): @@ -469,6 +500,7 @@ def capitalize_first(text): if __name__ == "__main__": to_create_prior_probs = False + to_create_entity_counts = False to_create_kb = True to_read_kb = False @@ -477,20 +509,25 @@ if __name__ == "__main__": if to_create_prior_probs: _read_wikipedia_prior_probs() + # STEP 2 : deduce entity frequencies from WP + # run only once ! + if to_create_entity_counts: + _write_entity_counts() + if to_create_kb: - # STEP 2 : create KB + # STEP 3 : create KB my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) - # STEP 3 : write KB to file + # STEP 4 : write KB to file my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) if to_read_kb: - # STEP 4 : read KB back in from file + # STEP 5 : read KB back in from file my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) @@ -507,5 +544,5 @@ if __name__ == "__main__": print("alias:", c.alias_) print("prior prob:", c.prior_prob) - # STEP 5: add KB to NLP pipeline + # STEP 6: add KB to NLP pipeline # add_el(my_kb, nlp) From 60b54ae8ce4ca5ad2bbb59153af283032a8905fc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 00:00:38 +0200 Subject: [PATCH 020/148] bulk entity writing and experiment with regex wikidata reader to speed up processing --- examples/pipeline/wikidata_entity_linking.py | 92 ++++++++++++++++---- spacy/kb.pxd | 2 + spacy/kb.pyx | 56 ++++++++++++ 3 files changed, 135 insertions(+), 15 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 43ba7d8d3..0a373e5fa 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -49,7 +49,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("1. 
_read_wikidata_entities", datetime.datetime.now()) print() - title_to_id = _read_wikidata_entities(limit=100000) + # title_to_id = _read_wikidata_entities_regex(limit=1000) + title_to_id = _read_wikidata_entities_json(limit=1000) title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -62,19 +63,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("3. _add_entities", datetime.datetime.now()) print() - _add_entities(kb, - entities=entity_list, - probs=entity_frequencies, - to_print=to_print) + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) + # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) print() print("4. _add_aliases", datetime.datetime.now()) print() - _add_aliases(kb, - title_to_id=title_to_id, - max_entities_per_alias=max_entities_per_alias, - min_occ=min_occ, - to_print=to_print) + _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,) # TODO: read wikipedia texts for entity context # _read_wikipedia() @@ -83,6 +78,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + print("done with kb", datetime.datetime.now()) + return kb @@ -131,8 +128,7 @@ def _write_entity_counts(to_print=False): print("Total count:", total_count) -def _add_entities(kb, entities, probs, to_print=False): - # TODO: this should be a bulk method +def _add_entities_depr(kb, entities, probs, to_print=False): for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -193,7 +189,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) -def _read_wikidata_entities(limit=None, to_print=False): +def _read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """ languages = {'en', 'de'} @@ -259,6 +255,7 @@ def _read_wikidata_entities(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id + # print(site, "for", unique_id) if parse_labels: labels = obj["labels"] @@ -296,6 +293,56 @@ def _read_wikidata_entities(limit=None, to_print=False): return title_to_id +def _read_wikidata_entities_regex_depr(limit=None, to_print=False): + """ Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. 
""" + + regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) + regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) + regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) + regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) + + title_to_id = dict() + + with bz2.open(WIKIDATA_JSON, mode='rb') as file: + line = file.readline() + cnt = 0 + while line and (not limit or cnt < limit): + if cnt % 100000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + clean_line = line.strip().decode("utf-8") + keep = False + + p31_matches = regex_p31.findall(clean_line) + if p31_matches: + for p31_match in p31_matches: + id_matches = regex_id.findall(p31_match) + for id_match in id_matches: + id_match = id_match[6:][:-1] + if id_match == "Q5" or id_match == "Q15632617": + keep = True + + if keep: + id_match = regex_id.search(clean_line).group(0) + id_match = id_match[6:][:-1] + + enwiki_matches = regex_enwiki.findall(clean_line) + if enwiki_matches: + for enwiki_match in enwiki_matches: + title_match = regex_title.search(enwiki_match).group(0) + title = title_match[9:][:-1] + title_to_id[title] = id_match + # print(title, "for", id_match) + + line = file.readline() + cnt += 1 + + return title_to_id + + def _read_wikipedia_prior_probs(): """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities The full file takes about 2h to parse 1100M lines (update printed every 5M lines) @@ -499,50 +546,65 @@ def capitalize_first(text): if __name__ == "__main__": + print("START", datetime.datetime.now()) + to_create_prior_probs = False to_create_entity_counts = False to_create_kb = True - to_read_kb = False + to_read_kb = True # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: + print("STEP 1: to_create_prior_probs", datetime.datetime.now()) _read_wikipedia_prior_probs() + print() # STEP 2 : deduce entity frequencies from WP # run only once ! 
if to_create_entity_counts: + print("STEP 2: to_create_entity_counts", datetime.datetime.now()) _write_entity_counts() + print() if to_create_kb: # STEP 3 : create KB + print("STEP 3: to_create_kb", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) + print() # STEP 4 : write KB to file + print("STEP 4: write KB", datetime.datetime.now()) my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) + print() if to_read_kb: # STEP 5 : read KB back in from file + print("STEP 5: to_read_kb", datetime.datetime.now()) my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) my_kb.load_bulk(KB_FILE) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) + print() # test KB candidates = my_kb.get_candidates("Bush") for c in candidates: - print() print("entity:", c.entity_) print("entity freq:", c.entity_freq) print("alias:", c.alias_) print("prior prob:", c.prior_prob) + print() # STEP 6: add KB to NLP pipeline + # print("STEP 6: use KB", datetime.datetime.now()) # add_el(my_kb, nlp) + + print("STOP", datetime.datetime.now()) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 82b06d192..494848e5e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -131,6 +131,8 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) cpdef load_bulk(self, loc) + cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list) + cpdef set_aliases(self, alias_list, entities_list, probabilities_list) cdef class Writer: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ad2e13b5e..ba870661d 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -111,6 +111,62 @@ cdef class KnowledgeBase: return entity_hash + cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list): + nr_entities = len(entity_list) + self._entry_index = PreshMap(nr_entities+1) + self._entries = entry_vec(nr_entities+1) + + i = 0 + cdef EntryC entry + cdef int32_t dummy_value = 342 + while i < nr_entities: + # TODO features and vectors + entity_hash = self.vocab.strings.add(entity_list[i]) + entry.entity_hash = entity_hash + entry.prob = prob_list[i] + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value + + self._entries[i+1] = entry + self._entry_index[entity_hash] = i+1 + + i += 1 + + # TODO: this method is untested + cpdef set_aliases(self, alias_list, entities_list, probabilities_list): + nr_aliases = len(alias_list) + self._alias_index = PreshMap(nr_aliases+1) + self._aliases_table = alias_vec(nr_aliases+1) + + i = 0 + cdef AliasC alias + cdef int32_t dummy_value = 342 + while i <= nr_aliases: + alias_hash = self.vocab.strings.add(alias_list[i]) + entities = entities_list[i] + probabilities = probabilities_list[i] + + nr_candidates = len(entities) + entry_indices = vector[int64_t](nr_candidates) + probs = vector[float](nr_candidates) + + for j in range(0, nr_candidates): + entity = entities[j] + entity_hash = self.vocab.strings[entity] + if not entity_hash in self._entry_index: + raise ValueError(Errors.E134.format(alias=alias, entity=entity)) + + entry_index = self._entry_index.get(entity_hash) + entry_indices[j] = entry_index + + alias.entry_indices = entry_indices + alias.probs = probs + + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + + i += 1 + def add_alias(self, unicode alias, entities, probabilities): """ For a given 
alias, add its potential entities and prior probabilies to the KB. From 3629a52ede3479cbf494e5e9472ceefff78ea74b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 01:00:59 +0200 Subject: [PATCH 021/148] reading all persons in wikidata --- examples/pipeline/wikidata_entity_linking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 0a373e5fa..287e4a50b 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -50,7 +50,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print("1. _read_wikidata_entities", datetime.datetime.now()) print() # title_to_id = _read_wikidata_entities_regex(limit=1000) - title_to_id = _read_wikidata_entities_json(limit=1000) + title_to_id = _read_wikidata_entities_json(limit=None) title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -209,7 +209,7 @@ def _read_wikidata_entities_json(limit=None, to_print=False): line = file.readline() cnt = 0 while line and (not limit or cnt < limit): - if cnt % 100000 == 0: + if cnt % 500000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") clean_line = line.strip() if clean_line.endswith(b","): @@ -307,7 +307,7 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False): line = file.readline() cnt = 0 while line and (not limit or cnt < limit): - if cnt % 100000 == 0: + if cnt % 500000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") clean_line = line.strip() if clean_line.endswith(b","): From 1ae41daaa92e4099d4e15c7b5a9801ad7994ad68 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 23:05:40 +0200 Subject: [PATCH 022/148] allow small rounding errors --- examples/pipeline/wikidata_entity_linking.py | 9 ++++++--- spacy/kb.pyx | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 287e4a50b..02a766d0f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,13 +61,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): entity_frequencies = _get_entity_frequencies(entities=title_list) print() - print("3. _add_entities", datetime.datetime.now()) + print("3. adding", len(entity_list), "entities", datetime.datetime.now()) print() kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) print() - print("4. _add_aliases", datetime.datetime.now()) + print("4. 
adding aliases", datetime.datetime.now()) print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,) @@ -171,7 +171,10 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals prior_probs.append(p_entity_givenalias) if selected_entities: - kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + try: + kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + except ValueError as e: + print(e) total_count = 0 counts = list() entities = list() diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ba870661d..d471130d0 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -179,9 +179,9 @@ cdef class KnowledgeBase: entities_length=len(entities), probabilities_length=len(probabilities))) - # Throw an error if the probabilities sum up to more than 1 + # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors) prob_sum = sum(probabilities) - if prob_sum > 1: + if prob_sum > 1.00001: raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) cdef hash_t alias_hash = self.vocab.strings.add(alias) From 835355219123d4502eb3157a3700b6a7d3ae06d2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 23:26:16 +0200 Subject: [PATCH 023/148] cleanup --- examples/pipeline/wikidata_entity_linking.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 02a766d0f..e293be90f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -49,7 +49,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("1. _read_wikidata_entities", datetime.datetime.now()) print() - # title_to_id = _read_wikidata_entities_regex(limit=1000) + # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) title_to_id = _read_wikidata_entities_json(limit=None) title_list = list(title_to_id.keys()) @@ -64,7 +64,6 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print("3. adding", len(entity_list), "entities", datetime.datetime.now()) print() kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) - # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) print() print("4. 
adding aliases", datetime.datetime.now()) @@ -128,14 +127,6 @@ def _write_entity_counts(to_print=False): print("Total count:", total_count) -def _add_entities_depr(kb, entities, probs, to_print=False): - for entity, prob in zip(entities, probs): - kb.add_entity(entity=entity, prob=prob) - - if to_print: - print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) - - def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): wp_titles = title_to_id.keys() @@ -553,7 +544,7 @@ if __name__ == "__main__": to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False to_read_kb = True # STEP 1 : create prior probabilities from WP From 581dc9742d2a7dc790bab7fe59993de8b1279b3b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 2 May 2019 17:09:56 +0200 Subject: [PATCH 024/148] parsing clean text from WP articles to use as input data for NER and NEL --- examples/pipeline/wikidata_entity_linking.py | 492 ++++++++++++------- 1 file changed, 320 insertions(+), 172 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index e293be90f..e6df39631 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -10,9 +10,13 @@ import json import spacy import datetime import bz2 + from spacy.kb import KnowledgeBase from spacy.vocab import Vocab +# requires: pip install neuralcoref --no-binary neuralcoref +# import neuralcoref + # TODO: remove hardcoded paths WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' @@ -20,6 +24,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' +ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' @@ -43,7 +48,151 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", map_alias_to_link = dict() -def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): +def read_wikipedia_prior_probs(): + """ + STEP 1: Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines). + It works relatively fast because we don't care about which article we parsed the interwiki from, + we just process line by line. 
+ """ + + with bz2.open(ENWIKI_DUMP, mode='rb') as file: + line = file.readline() + cnt = 0 + while line: + if cnt % 5000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + clean_line = line.strip().decode("utf-8") + + aliases, entities, normalizations = _get_wp_links(clean_line) + for alias, entity, norm in zip(aliases, entities, normalizations): + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + + line = file.readline() + cnt += 1 + + # write all aliases and their entities and occurrences to file + with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: + outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") + for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): + for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + + +# find the links +link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + +# match on interwiki links, e.g. `en:` or `:fr:` +ns_regex = r":?" + "[a-z][a-z]" + ":" + +# match on Namespace: optionally preceded by a : +for ns in wiki_namespaces: + ns_regex += "|" + ":?" + ns + ":" + +ns_regex = re.compile(ns_regex, re.IGNORECASE) + + +def _get_wp_links(text): + aliases = [] + entities = [] + normalizations = [] + + matches = link_regex.findall(text) + for match in matches: + match = match[2:][:-2].replace("_", " ").strip() + + if ns_regex.match(match): + pass # ignore namespaces at the beginning of the string + + # this is a simple link, with the alias the same as the mention + elif "|" not in match: + aliases.append(match) + entities.append(match) + normalizations.append(True) + + # in wiki format, the link is written as [[entity|alias]] + else: + splits = match.split("|") + entity = splits[0].strip() + alias = splits[1].strip() + # specific wiki format [[alias (specification)|]] + if len(alias) == 0 and "(" in entity: + alias = entity.split("(")[0] + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + else: + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + + return aliases, entities, normalizations + + +def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): + alias = alias.strip() + entity = entity.strip() + + # remove everything after # as this is not part of the title but refers to a specific paragraph + if normalize_entity: + # wikipedia titles are always capitalized + entity = _capitalize_first(entity.split("#")[0]) + if normalize_alias: + alias = alias.split("#")[0] + + if alias and entity: + alias_dict = map_alias_to_link.get(alias, dict()) + entity_count = alias_dict.get(entity, 0) + alias_dict[entity] = entity_count + 1 + map_alias_to_link[alias] = alias_dict + + +def _capitalize_first(text): + if not text: + return None + result = text[0].capitalize() + if len(result) > 0: + result += text[1:] + return result + + +def write_entity_counts(to_print=False): + """ STEP 2: write entity counts """ + entity_to_count = dict() + total_count = 0 + + with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + + while line: + splits = line.replace('\n', "").split(sep='|') + # alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count + + 
total_count += count + + line = prior_file.readline() + + with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + + if to_print: + for entity, count in entity_to_count.items(): + print("Entity count:", entity, count) + print("Total count:", total_count) + + +def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True): + """ STEP 3: create the knowledge base """ kb = KnowledgeBase(vocab=vocab) print() @@ -52,6 +201,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) title_to_id = _read_wikidata_entities_json(limit=None) + # write the title-ID mapping to file + if write_entity_defs: + with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file: + entity_file.write("WP_title" + "|" + "WD_id" + "\n") + for title, qid in title_to_id.items(): + entity_file.write(title + "|" + str(qid) + "\n") + title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -94,37 +250,16 @@ def _get_entity_frequencies(entities): return [entity_to_count.get(e, 0) for e in entities] -def _write_entity_counts(to_print=False): - entity_to_count = dict() - total_count = 0 - - with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: +def _get_entity_to_id(): + entity_to_id = dict() + with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') # skip header - prior_file.readline() - line = prior_file.readline() + next(csvreader) + for row in csvreader: + entity_to_id[row[0]] = row[1] - while line: - splits = line.replace('\n', "").split(sep='|') - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - entity_file.write(entity + "|" + str(count) + "\n") - - if to_print: - for entity, count in entity_to_count.items(): - print("Entity count:", entity, count) - print("Total count:", total_count) + return entity_to_id def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): @@ -337,85 +472,60 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False): return title_to_id -def _read_wikipedia_prior_probs(): - """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities - The full file takes about 2h to parse 1100M lines (update printed every 5M lines) - """ +def test_kb(kb): + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + nlp = spacy.load('en_core_web_sm') - # find the links - link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) - # match on interwiki links, e.g. `en:` or `:fr:` - ns_regex = r":?" + "[a-z][a-z]" + ":" + candidates = my_kb.get_candidates("Bush") - # match on Namespace: optionally preceded by a : - for ns in wiki_namespaces: - ns_regex += "|" + ":?" 
+ ns + ":" + print("generating candidates for 'Bush' :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() - ns_regex = re.compile(ns_regex, re.IGNORECASE) + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." + doc = nlp(text) - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - while line: - if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - matches = link_regex.findall(clean_line) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore namespaces at the beginning of the string - - # this is a simple link, with the alias the same as the mention - elif "|" not in match: - _store_alias(match, match, normalize_alias=True, normalize_entity=True) - - # in wiki format, the link is written as [[entity|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) - else: - _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) - - line = file.readline() - cnt += 1 - - # write all aliases and their entities and occurrences to file - with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: - outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") - for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): - outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) -def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): - alias = alias.strip() - entity = entity.strip() +def add_coref(): + """ STEP 5: add coreference resolution to our model """ + nlp = spacy.load('en_core_web_sm') + # nlp = spacy.load('en') - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity = capitalize_first(entity.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] + # TODO: this doesn't work yet + # neuralcoref.add_to_pipe(nlp) + print("done adding to pipe") - if alias and entity: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity, 0) - alias_dict[entity] = entity_count + 1 - map_alias_to_link[alias] = alias_dict + doc = nlp(u'My sister has a dog. 
She loves him.') + print("done doc") + + print(doc._.has_coref) + print(doc._.coref_clusters) -def _read_wikipedia(): - """ Read the XML wikipedia data """ +def create_training(): + nlp = spacy.load('en_core_web_sm') + wp_to_id = _get_entity_to_id() + _read_wikipedia(nlp, wp_to_id, limit=10000) + + +def _read_wikipedia(nlp, wp_to_id, limit=None): + """ Read the XML wikipedia data to parse out training data """ + + # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) + # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) + + title_regex = re.compile(r'(?<=).*(?=)') + id_regex = re.compile(r'(?<=)\d*(?=)') with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() @@ -424,19 +534,19 @@ def _read_wikipedia(): article_title = None article_id = None reading_text = False - while line and cnt < 1000000: + while line and (not limit or cnt < limit): clean_line = line.strip().decode("utf-8") # Start reading new page if clean_line == "": article_text = "" article_title = None - article_id = 342 + article_id = None # finished reading this page elif clean_line == "": if article_id: - _store_wp_article(article_id, article_title, article_text.strip()) + _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text.strip()) # start reading text within a page if ")\d*(?=)", clean_line) + ids = id_regex.search(clean_line) if ids: article_id = ids[0] # read the title of this article - titles = re.findall(r"(?<=).*(?=)", clean_line) + titles = title_regex.search(clean_line) if titles: article_title = titles[0].strip() @@ -463,107 +573,145 @@ def _read_wikipedia(): cnt += 1 -def _store_wp_article(article_id, article_title, article_text): - pass +def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): + # remove the text tags + text_regex = re.compile(r'(?<=).*(?=)') + text = text_regex.search(article_text).group(0) + + # stop processing if this is a redirect page + if text.startswith("#REDIRECT"): + return + print("WP article", article_id, ":", article_title) - print(article_text) - print(_get_clean_wp_text(article_text)) + + article_dict = dict() + aliases, entities, normalizations = _get_wp_links(text) + for alias, entity, norm in zip(aliases, entities, normalizations): + entity_id = wp_to_id.get(entity) + if entity_id: + # print(" ", alias, '-->', entity, '-->', entity_id) + article_dict[alias] = entity_id + article_dict[entity] = entity_id + + # get the raw text without markup etc + clean_text = _get_clean_wp_text(text) + + #print(text) + print(clean_text) print() + _run_ner(nlp, article_id, article_title, clean_text, article_dict) + + +info_regex = re.compile(r'{[^{]*?}') +interwiki_regex = re.compile(r'\[\[([^|]*?)]]') +interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') +htlm_regex = re.compile(r'<!--[^!]*-->') +category_regex = re.compile(r'\[\[Category:[^\[]*]]') +file_regex = re.compile(r'\[\[File:[^[\]]+]]') +ref_regex = re.compile(r'<ref.*?>') # non-greedy +ref_2_regex = re.compile(r'</ref.*?>') # non-greedy + def _get_clean_wp_text(article_text): - # TODO: compile the regular expressions + clean_text = article_text.strip() - # remove Category and File statements - clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text) - print("1", clean_text) - clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) # TODO: this doesn't work yet - print("2", clean_text) - - # remove bolding markup - clean_text = re.sub('\'\'\'', '', clean_text) - clean_text = re.sub('\'\'', '', clean_text) + # remove bolding & italic markup + clean_text = 
clean_text.replace('\'\'\'', '') + clean_text = clean_text.replace('\'\'', '') # remove nested {{info}} statements by removing the inner/smallest ones first and iterating try_again = True previous_length = len(clean_text) while try_again: - clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match excluding a nested { + clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { if len(clean_text) < previous_length: try_again = True else: try_again = False previous_length = len(clean_text) - # remove multiple spaces - while ' ' in clean_text: - clean_text = re.sub(' ', ' ', clean_text) - # remove simple interwiki links (no alternative name) - clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text) + clean_text = interwiki_regex.sub(r'\1', clean_text) # remove simple interwiki links by picking the alternative name - clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text) + clean_text = interwiki_2_regex.sub(r'\1', clean_text) # remove HTML comments - clean_text = re.sub('<!--[^!]*-->', '', clean_text) + clean_text = htlm_regex.sub('', clean_text) - return clean_text + # remove Category and File statements + clean_text = category_regex.sub('', clean_text) + clean_text = file_regex.sub('', clean_text) + + # remove multiple = + while '==' in clean_text: + clean_text = clean_text.replace("==", "=") + + clean_text = clean_text.replace(". =", ".") + clean_text = clean_text.replace(" = ", ". ") + clean_text = clean_text.replace("= ", ".") + clean_text = clean_text.replace(" =", "") + + # remove refs (non-greedy match) + clean_text = ref_regex.sub('', clean_text) + clean_text = ref_2_regex.sub('', clean_text) + + # remove additional wikiformatting + clean_text = re.sub(r'<blockquote>', '', clean_text) + clean_text = re.sub(r'</blockquote>', '', clean_text) + + # change special characters back to normal ones + clean_text = clean_text.replace(r'<', '<') + clean_text = clean_text.replace(r'>', '>') + clean_text = clean_text.replace(r'"', '"') + clean_text = clean_text.replace(r'&nbsp;', ' ') + clean_text = clean_text.replace(r'&', '&') + + # remove multiple spaces + while ' ' in clean_text: + clean_text = clean_text.replace(' ', ' ') + + return clean_text.strip() -def add_el(kb, nlp): - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." - doc = nlp(text) - - print() - for token in doc: - print("token", token.text, token.ent_type_, token.ent_kb_id_) - - print() - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - -def capitalize_first(text): - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - +def _run_ner(nlp, article_id, article_title, clean_text, article_dict): + pass # TODO if __name__ == "__main__": print("START", datetime.datetime.now()) + print() + my_kb = None + # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False to_create_kb = False - to_read_kb = True + + # read KB back in from file + to_read_kb = False + to_test_kb = False + + create_wp_training = True # STEP 1 : create prior probabilities from WP # run only once ! 
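For reference, the prior probabilities that the alias counts from STEP 1 feed into kb.add_alias() are nothing more than normalised link counts: P(entity | alias) = count(alias -> entity) / total count of that alias. A tiny worked example with invented counts and placeholder QIDs:

# hypothetical link counts collected for one alias in STEP 1
raw_counts = {"Q1": 80, "Q2": 15, "Q3": 5}
total = sum(raw_counts.values())
prior_probs = {qid: count / total for qid, count in raw_counts.items()}
print(prior_probs)   # {'Q1': 0.8, 'Q2': 0.15, 'Q3': 0.05}
# the values sum to (roughly) 1, which is what the rounding-tolerant
# check in kb.add_alias expects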
if to_create_prior_probs: print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - _read_wikipedia_prior_probs() + read_wikipedia_prior_probs() print() # STEP 2 : deduce entity frequencies from WP # run only once ! if to_create_entity_counts: print("STEP 2: to_create_entity_counts", datetime.datetime.now()) - _write_entity_counts() + write_entity_counts() print() + # STEP 3 : create KB and write to file + # run only once ! if to_create_kb: - # STEP 3 : create KB - print("STEP 3: to_create_kb", datetime.datetime.now()) + print("STEP 3a: to_create_kb", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) @@ -571,15 +719,14 @@ if __name__ == "__main__": print("kb aliases:", my_kb.get_size_aliases()) print() - # STEP 4 : write KB to file - print("STEP 4: write KB", datetime.datetime.now()) + print("STEP 3b: write KB", datetime.datetime.now()) my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) print() + # STEP 4 : read KB back in from file if to_read_kb: - # STEP 5 : read KB back in from file - print("STEP 5: to_read_kb", datetime.datetime.now()) + print("STEP 4: to_read_kb", datetime.datetime.now()) my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) @@ -589,16 +736,17 @@ if __name__ == "__main__": print() # test KB - candidates = my_kb.get_candidates("Bush") - for c in candidates: - print("entity:", c.entity_) - print("entity freq:", c.entity_freq) - print("alias:", c.alias_) - print("prior prob:", c.prior_prob) + if to_test_kb: + test_kb(my_kb) print() - # STEP 6: add KB to NLP pipeline - # print("STEP 6: use KB", datetime.datetime.now()) - # add_el(my_kb, nlp) + # STEP 5: create a training dataset from WP + if create_wp_training: + print("STEP 5: create training dataset", datetime.datetime.now()) + create_training() + # TODO coreference resolution + # add_coref() + + print() print("STOP", datetime.datetime.now()) From cba9680d13cd2cc1b5a2af9d82acf378dce8fede Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 2 May 2019 17:24:52 +0200 Subject: [PATCH 025/148] run NER on clean WP text and link to gold-standard entity IDs --- examples/pipeline/wikidata_entity_linking.py | 22 +++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index e6df39631..a0ffc3618 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -515,15 +515,12 @@ def add_coref(): def create_training(): nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia(nlp, wp_to_id, limit=10000) + _read_wikipedia_texts(nlp, wp_to_id, limit=10000) -def _read_wikipedia(nlp, wp_to_id, limit=None): +def _read_wikipedia_texts(nlp, wp_to_id, limit=None): """ Read the XML wikipedia data to parse out training data """ - # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') @@ -589,18 +586,15 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): for alias, entity, norm in zip(aliases, entities, normalizations): entity_id = wp_to_id.get(entity) if entity_id: - # print(" ", alias, '-->', entity, '-->', entity_id) article_dict[alias] = entity_id article_dict[entity] = entity_id # get the raw text without markup etc 
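    # Roughly, per the regexes defined further down, _get_clean_wp_text() strips
    # the leftover wiki markup so the remaining text can serve as raw training
    # input: nested {{...}} templates (removed inner-first, iteratively),
    # [[...]] links (keeping only their surface text), HTML comments,
    # Category:/File: statements, <ref> tags, heading '=' markers, escaped HTML
    # entities and redundant whitespace.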
clean_text = _get_clean_wp_text(text) - - #print(text) print(clean_text) - print() _run_ner(nlp, article_id, article_title, clean_text, article_dict) + print() info_regex = re.compile(r'{[^{]*?}') @@ -676,7 +670,15 @@ def _get_clean_wp_text(article_text): def _run_ner(nlp, article_id, article_title, clean_text, article_dict): - pass # TODO + doc = nlp(clean_text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to non-persons + ent_id = article_dict.get(ent.text) + if ent_id: + print(" -", ent.text, ent.label_, ent_id) + else: + print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases + if __name__ == "__main__": print("START", datetime.datetime.now()) From bbcb9da466d33c7ac118d8aa6cce67961a39ec9f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 3 May 2019 10:44:29 +0200 Subject: [PATCH 026/148] creating training data with clean WP texts and QID entities true/false --- examples/pipeline/wikidata_entity_linking.py | 92 ++++++++++++++++---- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a0ffc3618..cf388773a 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -29,6 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' +TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' + # these will/should be matched ignoring case wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", @@ -224,7 +226,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_enti print() print("4. 
adding aliases", datetime.datetime.now()) print() - _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,) + _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) # TODO: read wikipedia texts for entity context # _read_wikipedia() @@ -512,18 +514,27 @@ def add_coref(): print(doc._.coref_clusters) -def create_training(): - nlp = spacy.load('en_core_web_sm') +def create_training(kb): + if not kb: + raise ValueError("kb should be defined") + # nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia_texts(nlp, wp_to_id, limit=10000) + _read_wikipedia_texts(kb, wp_to_id, limit=None) -def _read_wikipedia_texts(nlp, wp_to_id, limit=None): +def _read_wikipedia_texts(kb, wp_to_id, limit=None): """ Read the XML wikipedia data to parse out training data """ title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') + # read entity training header file + _write_training_entity(article_id="article_id", + alias="alias", + entity="entity", + correct="correct", + append=False) + with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 1 @@ -532,6 +543,8 @@ def _read_wikipedia_texts(nlp, wp_to_id, limit=None): article_id = None reading_text = False while line and (not limit or cnt < limit): + if cnt % 500000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") # Start reading new page @@ -543,7 +556,7 @@ def _read_wikipedia_texts(nlp, wp_to_id, limit=None): # finished reading this page elif clean_line == "": if article_id: - _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text.strip()) + _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) # start reading text within a page if ").*(?=)') text = text_regex.search(article_text).group(0) @@ -579,7 +592,14 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): if text.startswith("#REDIRECT"): return - print("WP article", article_id, ":", article_title) + # print("WP article", article_id, ":", article_title) + # print() + # print(text) + + # get the raw text without markup etc + clean_text = _get_clean_wp_text(text) + # print() + # print(clean_text) article_dict = dict() aliases, entities, normalizations = _get_wp_links(text) @@ -589,12 +609,37 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): article_dict[alias] = entity_id article_dict[entity] = entity_id - # get the raw text without markup etc - clean_text = _get_clean_wp_text(text) - print(clean_text) + # print("found entities:") + for alias, entity in article_dict.items(): + # print(alias, "-->", entity) + candidates = kb.get_candidates(alias) - _run_ner(nlp, article_id, article_title, clean_text, article_dict) - print() + # as training data, we only store entities that are sufficiently ambiguous + if len(candidates) > 1: + _write_training_article(article_id=article_id, clean_text=clean_text) + # print("alias", alias) + + # print all incorrect candidates + for c in candidates: + if entity != c.entity_: + _write_training_entity(article_id=article_id, + alias=alias, + entity=c.entity_, + correct="0", + append=True) + + # print the one correct candidate + _write_training_entity(article_id=article_id, + alias=alias, + entity=entity, + correct="1", + append=True) + + # print("gold entity", entity) + # print() + + # _run_ner_depr(nlp, article_id, article_title, 
clean_text, article_dict) + # print() info_regex = re.compile(r'{[^{]*?}') @@ -669,7 +714,22 @@ def _get_clean_wp_text(article_text): return clean_text.strip() -def _run_ner(nlp, article_id, article_title, clean_text, article_dict): +def _write_training_article(article_id, clean_text): + file_loc = TRAINING_SET_DIR + "/" + str(article_id) + ".txt" + with open(file_loc, mode='w', encoding='utf8') as outputfile: + outputfile.write(clean_text) + + +def _write_training_entity(article_id, alias, entity, correct, append=True): + mode = "w" + if append: + mode = "a" + file_loc = TRAINING_SET_DIR + "/" + "gold_entities.csv" + with open(file_loc, mode=mode, encoding='utf8') as outputfile: + outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") + + +def _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict): doc = nlp(clean_text) for ent in doc.ents: if ent.label_ == "PERSON": # TODO: expand to non-persons @@ -691,7 +751,7 @@ if __name__ == "__main__": to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False create_wp_training = True @@ -745,7 +805,7 @@ if __name__ == "__main__": # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - create_training() + create_training(my_kb) # TODO coreference resolution # add_coref() From 34600c92bd5be2948debf465b9de9c2f3f2f16ee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 3 May 2019 15:10:09 +0200 Subject: [PATCH 027/148] try catch per article to ensure the pipeline goes on --- examples/pipeline/wikidata_entity_linking.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index cf388773a..a9be49742 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None): with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() - cnt = 1 + cnt = 0 article_text = "" article_title = None article_id = None @@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None): # finished reading this page elif clean_line == "": if article_id: - _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) + try: + _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title) + print(e) # start reading text within a page if ").*(?=)') + text_regex = re.compile(r'(?<=).*(?= Date: Fri, 3 May 2019 17:37:47 +0200 Subject: [PATCH 028/148] fix WP id parsing, speed up processing and remove ambiguous strings in one doc (for now) --- examples/pipeline/wikidata_entity_linking.py | 187 +++++++++++-------- 1 file changed, 110 insertions(+), 77 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a9be49742..0db7f4665 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -29,7 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' 
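A possible reader for the training files produced above (a sketch, not something these patches include), assuming the layout written by _write_training_entity() (pipe-separated article_id|alias|entity|correct rows under a header) and _write_training_article() (one <article_id>.txt file per annotated article):

import csv
from pathlib import Path

def read_training_set(training_dir):
    # collect the gold (alias, entity, correct) annotations per article
    gold_by_article = dict()
    with open(Path(training_dir) / "gold_entities.csv", encoding="utf8") as f:
        reader = csv.reader(f, delimiter="|")
        next(reader)   # skip the article_id|alias|entity|correct header
        for article_id, alias, entity, correct in reader:
            gold_by_article.setdefault(article_id, []).append((alias, entity, correct == "1"))

    # pair each annotated article with its cleaned text, if it was written out
    for article_id, mentions in gold_by_article.items():
        text_path = Path(training_dir) / (str(article_id) + ".txt")
        if text_path.exists():
            yield article_id, text_path.read_text(encoding="utf8"), mentions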
+TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' # these will/should be matched ignoring case @@ -523,74 +524,104 @@ def create_training(kb): def _read_wikipedia_texts(kb, wp_to_id, limit=None): - """ Read the XML wikipedia data to parse out training data """ + """ + Read the XML wikipedia data to parse out training data: + raw text data + positive and negative instances + """ title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') - # read entity training header file - _write_training_entity(article_id="article_id", - alias="alias", - entity="entity", - correct="correct", - append=False) + read_ids = set() - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - - # finished reading this page - elif clean_line == "": - if article_id: - try: - _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) - # on a previous run, an error occurred after 46M lines and 2h - except Exception as e: - print("Error processing article", article_id, article_title) - print(e) - - # start reading text within a page - if "": + reading_revision = True + elif clean_line == "": + reading_revision = False + + # Start reading new page + if clean_line == "": + article_text = "" + article_title = None + article_id = None + + # finished reading this page + elif clean_line == "": + if article_id: + try: + _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip()) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title) + print(e) + else: + print("Done processing a page, but couldn't find an article_id ?") + print(article_title) + print(article_text) + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + + # start reading text within a page + if ").*(?=).*(?= Date: Fri, 3 May 2019 18:09:09 +0200 Subject: [PATCH 029/148] run only 100M of WP data as training dataset (9%) --- examples/pipeline/wikidata_entity_linking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 0db7f4665..4fe97e874 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -520,7 +520,7 @@ def create_training(kb): raise ValueError("kb should be defined") # nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia_texts(kb, wp_to_id, limit=None) + _read_wikipedia_texts(kb, wp_to_id, limit=100000000) # TODO: full dataset def _read_wikipedia_texts(kb, wp_to_id, limit=None): @@ -552,7 +552,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None): reading_text = False reading_revision = False while line and (not limit or cnt < limit): - if cnt % 500000 == 0: + if cnt % 1000000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = 
line.strip().decode("utf-8") # print(clean_line) From 69612155782d586c26532dce0f3816d8befcf41a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 6 May 2019 10:56:56 +0200 Subject: [PATCH 030/148] refactor code to separate functionality into different files --- examples/pipeline/dummy_entity_linking.py | 6 +- .../pipeline/wiki_entity_linking/__init__.py | 0 .../wiki_entity_linking/kb_creator.py | 137 +++ .../pipeline/wiki_entity_linking/run_el.py | 36 + .../training_set_creator.py | 276 ++++++ .../wiki_entity_linking/wiki_nel_pipeline.py | 103 +++ .../wiki_entity_linking/wikidata_processor.py | 166 ++++ .../wikipedia_processor.py | 187 ++++ examples/pipeline/wikidata_entity_linking.py | 852 ------------------ 9 files changed, 908 insertions(+), 855 deletions(-) create mode 100644 examples/pipeline/wiki_entity_linking/__init__.py create mode 100644 examples/pipeline/wiki_entity_linking/kb_creator.py create mode 100644 examples/pipeline/wiki_entity_linking/run_el.py create mode 100644 examples/pipeline/wiki_entity_linking/training_set_creator.py create mode 100644 examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py create mode 100644 examples/pipeline/wiki_entity_linking/wikidata_processor.py create mode 100644 examples/pipeline/wiki_entity_linking/wikipedia_processor.py delete mode 100644 examples/pipeline/wikidata_entity_linking.py diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index e93e3e20b..ae36a57b3 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -66,6 +66,6 @@ def add_el(kb, nlp): if __name__ == "__main__": - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab) - add_el(my_kb, nlp) + my_nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(my_nlp.vocab) + add_el(my_kb, my_nlp) diff --git a/examples/pipeline/wiki_entity_linking/__init__.py b/examples/pipeline/wiki_entity_linking/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py new file mode 100644 index 000000000..7ca7cfad1 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import spacy +from spacy.kb import KnowledgeBase + +import datetime + +from . import wikipedia_processor as wp +from . import wikidata_processor as wd + + +def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input, + to_print=False, write_entity_defs=True): + """ Create the knowledge base from Wikidata entries """ + kb = KnowledgeBase(vocab=vocab) + + print() + print("1. _read_wikidata_entities", datetime.datetime.now()) + print() + # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) + title_to_id = wd.read_wikidata_entities_json(limit=None) + + # write the title-ID mapping to file + if write_entity_defs: + with open(entity_output, mode='w', encoding='utf8') as entity_file: + entity_file.write("WP_title" + "|" + "WD_id" + "\n") + for title, qid in title_to_id.items(): + entity_file.write(title + "|" + str(qid) + "\n") + + title_list = list(title_to_id.keys()) + entity_list = [title_to_id[x] for x in title_list] + + print() + print("2. _get_entity_frequencies", datetime.datetime.now()) + print() + entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) + + print() + print("3. 
adding", len(entity_list), "entities", datetime.datetime.now()) + print() + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) + + print() + print("4. adding aliases", datetime.datetime.now()) + print() + _add_aliases(kb, title_to_id=title_to_id, + max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, + prior_prob_input=prior_prob_input) + + if to_print: + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + + print("done with kb", datetime.datetime.now()) + + return kb + + +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False): + wp_titles = title_to_id.keys() + + if to_print: + print("wp titles:", wp_titles) + + # adding aliases with prior probabilities + with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + # we can read this file sequentially, it's sorted by alias, and then by count + previous_alias = None + total_count = 0 + counts = list() + entities = list() + while line: + splits = line.replace('\n', "").split(sep='|') + new_alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + if new_alias != previous_alias and previous_alias: + # done reading the previous alias --> output + if len(entities) > 0: + selected_entities = list() + prior_probs = list() + for ent_count, ent_string in zip(counts, entities): + if ent_string in wp_titles: + wd_id = title_to_id[ent_string] + p_entity_givenalias = ent_count / total_count + selected_entities.append(wd_id) + prior_probs.append(p_entity_givenalias) + + if selected_entities: + try: + kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + except ValueError as e: + print(e) + total_count = 0 + counts = list() + entities = list() + + total_count += count + + if len(entities) < max_entities_per_alias and count >= min_occ: + counts.append(count) + entities.append(entity) + previous_alias = new_alias + + line = prior_file.readline() + + if to_print: + print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) + + +def test_kb(kb): + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + nlp = spacy.load('en_core_web_sm') + + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + candidates = kb.get_candidates("Bush") + + print("generating candidates for 'Bush' :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." 
+ doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py new file mode 100644 index 000000000..eb8343722 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import spacy + +# requires: pip install neuralcoref --no-binary neuralcoref +# import neuralcoref + + +# TODO +def add_coref(): + """ Add coreference resolution to our model """ + nlp = spacy.load('en_core_web_sm') + # nlp = spacy.load('en') + + # TODO: this doesn't work yet + # neuralcoref.add_to_pipe(nlp) + print("done adding to pipe") + + doc = nlp(u'My sister has a dog. She loves him.') + print("done doc") + + print(doc._.has_coref) + print(doc._.coref_clusters) + + +# TODO +def _run_ner_depr(nlp, clean_text, article_dict): + doc = nlp(clean_text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to non-persons + ent_id = article_dict.get(ent.text) + if ent_id: + print(" -", ent.text, ent.label_, ent_id) + else: + print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py new file mode 100644 index 000000000..e46aeec5b --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -0,0 +1,276 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import csv +import bz2 +import datetime + +from . import wikipedia_processor as wp + +""" +Process Wikipedia interlinks to generate a training dataset for the EL algorithm +""" + + +def create_training(kb, entity_input, training_output): + if not kb: + raise ValueError("kb should be defined") + # nlp = spacy.load('en_core_web_sm') + wp_to_id = _get_entity_to_id(entity_input) + _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset + + +def _get_entity_to_id(entity_input): + entity_to_id = dict() + with open(entity_input, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_id[row[0]] = row[1] + + return entity_to_id + + +def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): + """ + Read the XML wikipedia data to parse out training data: + raw text data + positive and negative instances + """ + + title_regex = re.compile(r'(?<=).*(?=)') + id_regex = re.compile(r'(?<=)\d*(?=)') + + read_ids = set() + + entityfile_loc = training_output + "/" + "gold_entities.csv" + with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: + # write entity training header file + _write_training_entity(outputfile=entityfile, + article_id="article_id", + alias="alias", + entity="entity", + correct="correct") + + with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: + line = file.readline() + cnt = 0 + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + while line and (not limit or cnt < limit): + if cnt % 1000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + clean_line = line.strip().decode("utf-8") + # print(clean_line) + + if clean_line == "": + reading_revision = True + elif clean_line == "": + reading_revision = False + + # Start reading new page + if clean_line == "": + article_text = "" + article_title 
= None + article_id = None + + # finished reading this page + elif clean_line == "": + if article_id: + try: + _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text.strip(), training_output) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title, e) + else: + print("Done processing a page, but couldn't find an article_id ?") + print(article_title) + print(article_text) + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + + # start reading text within a page + if ").*(?=", entity) + candidates = kb.get_candidates(alias) + + # as training data, we only store entities that are sufficiently ambiguous + if len(candidates) > 1: + _write_training_article(article_id=article_id, clean_text=clean_text, training_output=training_output) + # print("alias", alias) + + # print all incorrect candidates + for c in candidates: + if entity != c.entity_: + _write_training_entity(outputfile=entityfile, + article_id=article_id, + alias=alias, + entity=c.entity_, + correct="0") + + # print the one correct candidate + _write_training_entity(outputfile=entityfile, + article_id=article_id, + alias=alias, + entity=entity, + correct="1") + + # print("gold entity", entity) + # print() + + # _run_ner_depr(nlp, clean_text, article_dict) + # print() + + +info_regex = re.compile(r'{[^{]*?}') +interwiki_regex = re.compile(r'\[\[([^|]*?)]]') +interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') +htlm_regex = re.compile(r'<!--[^!]*-->') +category_regex = re.compile(r'\[\[Category:[^\[]*]]') +file_regex = re.compile(r'\[\[File:[^[\]]+]]') +ref_regex = re.compile(r'<ref.*?>') # non-greedy +ref_2_regex = re.compile(r'</ref.*?>') # non-greedy + + +def _get_clean_wp_text(article_text): + clean_text = article_text.strip() + + # remove bolding & italic markup + clean_text = clean_text.replace('\'\'\'', '') + clean_text = clean_text.replace('\'\'', '') + + # remove nested {{info}} statements by removing the inner/smallest ones first and iterating + try_again = True + previous_length = len(clean_text) + while try_again: + clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { + if len(clean_text) < previous_length: + try_again = True + else: + try_again = False + previous_length = len(clean_text) + + # remove simple interwiki links (no alternative name) + clean_text = interwiki_regex.sub(r'\1', clean_text) + + # remove simple interwiki links by picking the alternative name + clean_text = interwiki_2_regex.sub(r'\1', clean_text) + + # remove HTML comments + clean_text = htlm_regex.sub('', clean_text) + + # remove Category and File statements + clean_text = category_regex.sub('', clean_text) + clean_text = file_regex.sub('', clean_text) + + # remove multiple = + while '==' in clean_text: + clean_text = clean_text.replace("==", "=") + + clean_text = clean_text.replace(". =", ".") + clean_text = clean_text.replace(" = ", ". 
") + clean_text = clean_text.replace("= ", ".") + clean_text = clean_text.replace(" =", "") + + # remove refs (non-greedy match) + clean_text = ref_regex.sub('', clean_text) + clean_text = ref_2_regex.sub('', clean_text) + + # remove additional wikiformatting + clean_text = re.sub(r'<blockquote>', '', clean_text) + clean_text = re.sub(r'</blockquote>', '', clean_text) + + # change special characters back to normal ones + clean_text = clean_text.replace(r'<', '<') + clean_text = clean_text.replace(r'>', '>') + clean_text = clean_text.replace(r'"', '"') + clean_text = clean_text.replace(r'&nbsp;', ' ') + clean_text = clean_text.replace(r'&', '&') + + # remove multiple spaces + while ' ' in clean_text: + clean_text = clean_text.replace(' ', ' ') + + return clean_text.strip() + + +def _write_training_article(article_id, clean_text, training_output): + file_loc = training_output + "/" + str(article_id) + ".txt" + with open(file_loc, mode='w', encoding='utf8') as outputfile: + outputfile.write(clean_text) + + +def _write_training_entity(outputfile, article_id, alias, entity, correct): + outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py new file mode 100644 index 000000000..20d4f5953 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from . import wikipedia_processor as wp, kb_creator, training_set_creator + +import spacy +from spacy.vocab import Vocab +from spacy.kb import KnowledgeBase +import datetime + +""" +Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. +""" + +PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' +ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' +ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' + +KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' +VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' + +TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' + + +if __name__ == "__main__": + print("START", datetime.datetime.now()) + print() + my_kb = None + + # one-time methods to create KB and write to file + to_create_prior_probs = False + to_create_entity_counts = False + to_create_kb = False + + # read KB back in from file + to_read_kb = True + to_test_kb = False + + create_wp_training = False + + # STEP 1 : create prior probabilities from WP + # run only once ! + if to_create_prior_probs: + print("STEP 1: to_create_prior_probs", datetime.datetime.now()) + wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB) + print() + + # STEP 2 : deduce entity frequencies from WP + # run only once ! + if to_create_entity_counts: + print("STEP 2: to_create_entity_counts", datetime.datetime.now()) + wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False) + print() + + # STEP 3 : create KB and write to file + # run only once ! 
+ if to_create_kb: + print("STEP 3a: to_create_kb", datetime.datetime.now()) + my_nlp = spacy.load('en_core_web_sm') + my_vocab = my_nlp.vocab + my_kb = kb_creator.create_kb(my_vocab, + max_entities_per_alias=10, + min_occ=5, + entity_output=ENTITY_DEFS, + count_input=ENTITY_COUNTS, + prior_prob_input=PRIOR_PROB, + to_print=False) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) + print() + + print("STEP 3b: write KB", datetime.datetime.now()) + my_kb.dump(KB_FILE) + my_vocab.to_disk(VOCAB_DIR) + print() + + # STEP 4 : read KB back in from file + if to_read_kb: + print("STEP 4: to_read_kb", datetime.datetime.now()) + my_vocab = Vocab() + my_vocab.from_disk(VOCAB_DIR) + my_kb = KnowledgeBase(vocab=my_vocab) + my_kb.load_bulk(KB_FILE) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) + print() + + # test KB + if to_test_kb: + kb_creator.test_kb(my_kb) + print() + + # STEP 5: create a training dataset from WP + if create_wp_training: + print("STEP 5: create training dataset", datetime.datetime.now()) + training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR) + + # TODO coreference resolution + # add_coref() + + print() + print("STOP", datetime.datetime.now()) diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py new file mode 100644 index 000000000..03db05414 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -0,0 +1,166 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import bz2 +import json +import datetime + +# TODO: remove hardcoded paths +WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' + + +def read_wikidata_entities_json(limit=None, to_print=False): + """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" + + languages = {'en', 'de'} + prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected + site_filter = 'enwiki' + + title_to_id = dict() + + # parse appropriate fields - depending on what we need in the KB + parse_properties = False + parse_sitelinks = True + parse_labels = False + parse_descriptions = False + parse_aliases = False + + with bz2.open(WIKIDATA_JSON, mode='rb') as file: + line = file.readline() + cnt = 0 + while line and (not limit or cnt < limit): + if cnt % 500000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + obj = json.loads(clean_line) + entry_type = obj["type"] + + if entry_type == "item": + # filtering records on their properties + keep = False + + claims = obj["claims"] + for prop, value_set in prop_filter.items(): + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + cp_rank = cp['rank'] + if cp_rank != "deprecated" and cp_id in value_set: + keep = True + + if keep: + unique_id = obj["id"] + + if to_print: + print("ID:", unique_id) + print("type:", entry_type) + + # parsing all properties that refer to other entities + if parse_properties: + for prop, claim_property in claims.items(): + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + if cp_values: + if to_print: + print("prop:", prop, cp_values) + + if parse_sitelinks: + site_value = obj["sitelinks"].get(site_filter, None) + if site_value: + site = site_value['title'] + if to_print: + print(site_filter, ":", site) + title_to_id[site] = unique_id + # print(site, "for", unique_id) + + if parse_labels: + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) + + if parse_descriptions: + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + + if parse_aliases: + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) + + if to_print: + print() + line = file.readline() + cnt += 1 + + return title_to_id + + +def _read_wikidata_entities_regex_depr(limit=None): + """ + Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. + TODO: doesn't work yet. may be deleted ? 
+ """ + + regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) + regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) + regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) + regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) + + title_to_id = dict() + + with bz2.open(WIKIDATA_JSON, mode='rb') as file: + line = file.readline() + cnt = 0 + while line and (not limit or cnt < limit): + if cnt % 500000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + clean_line = line.strip().decode("utf-8") + keep = False + + p31_matches = regex_p31.findall(clean_line) + if p31_matches: + for p31_match in p31_matches: + id_matches = regex_id.findall(p31_match) + for id_match in id_matches: + id_match = id_match[6:][:-1] + if id_match == "Q5" or id_match == "Q15632617": + keep = True + + if keep: + id_match = regex_id.search(clean_line).group(0) + id_match = id_match[6:][:-1] + + enwiki_matches = regex_enwiki.findall(clean_line) + if enwiki_matches: + for enwiki_match in enwiki_matches: + title_match = regex_title.search(enwiki_match).group(0) + title = title_match[9:][:-1] + title_to_id[title] = id_match + + line = file.readline() + cnt += 1 + + return title_to_id diff --git a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py new file mode 100644 index 000000000..0461cb19f --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py @@ -0,0 +1,187 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import bz2 +import csv +import datetime + +""" +Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. +""" + + +# TODO: remove hardcoded paths +ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' +ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' + +map_alias_to_link = dict() + +# these will/should be matched ignoring case +wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", + "d", "dbdump", "download", "Draft", "Education", "Foundation", + "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", + "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", + "MediaZilla", "Meta", "Metawikipedia", "Module", + "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", + "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", + "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", + "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", + "User", "User talk", "v", "voy", + "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", + "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", + "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] + +# find the links +link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + +# match on interwiki links, e.g. `en:` or `:fr:` +ns_regex = r":?" + "[a-z][a-z]" + ":" + +# match on Namespace: optionally preceded by a : +for ns in wiki_namespaces: + ns_regex += "|" + ":?" 
+ ns + ":" + +ns_regex = re.compile(ns_regex, re.IGNORECASE) + + +def read_wikipedia_prior_probs(prior_prob_output): + """ + Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines). + It works relatively fast because we don't care about which article we parsed the interwiki from, + we just process line by line. + """ + + with bz2.open(ENWIKI_DUMP, mode='rb') as file: + line = file.readline() + cnt = 0 + while line: + if cnt % 5000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + clean_line = line.strip().decode("utf-8") + + aliases, entities, normalizations = get_wp_links(clean_line) + for alias, entity, norm in zip(aliases, entities, normalizations): + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + + line = file.readline() + cnt += 1 + + # write all aliases and their entities and occurrences to file + with open(prior_prob_output, mode='w', encoding='utf8') as outputfile: + outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") + for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): + for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + + +def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): + alias = alias.strip() + entity = entity.strip() + + # remove everything after # as this is not part of the title but refers to a specific paragraph + if normalize_entity: + # wikipedia titles are always capitalized + entity = _capitalize_first(entity.split("#")[0]) + if normalize_alias: + alias = alias.split("#")[0] + + if alias and entity: + alias_dict = map_alias_to_link.get(alias, dict()) + entity_count = alias_dict.get(entity, 0) + alias_dict[entity] = entity_count + 1 + map_alias_to_link[alias] = alias_dict + + +def get_wp_links(text): + aliases = [] + entities = [] + normalizations = [] + + matches = link_regex.findall(text) + for match in matches: + match = match[2:][:-2].replace("_", " ").strip() + + if ns_regex.match(match): + pass # ignore namespaces at the beginning of the string + + # this is a simple link, with the alias the same as the mention + elif "|" not in match: + aliases.append(match) + entities.append(match) + normalizations.append(True) + + # in wiki format, the link is written as [[entity|alias]] + else: + splits = match.split("|") + entity = splits[0].strip() + alias = splits[1].strip() + # specific wiki format [[alias (specification)|]] + if len(alias) == 0 and "(" in entity: + alias = entity.split("(")[0] + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + else: + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + + return aliases, entities, normalizations + + +def _capitalize_first(text): + if not text: + return None + result = text[0].capitalize() + if len(result) > 0: + result += text[1:] + return result + + +def write_entity_counts(prior_prob_input, count_output, to_print=False): + """ Write entity counts for quick access later """ + entity_to_count = dict() + total_count = 0 + + with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + + while line: + splits = line.replace('\n', "").split(sep='|') + # alias = 
splits[0] + count = int(splits[1]) + entity = splits[2] + + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count + + total_count += count + + line = prior_file.readline() + + with open(count_output, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + + if to_print: + for entity, count in entity_to_count.items(): + print("Entity count:", entity, count) + print("Total count:", total_count) + + +def get_entity_frequencies(count_input, entities): + entity_to_count = dict() + with open(count_input, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_count[row[0]] = int(row[1]) + + return [entity_to_count.get(e, 0) for e in entities] diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py deleted file mode 100644 index 4fe97e874..000000000 --- a/examples/pipeline/wikidata_entity_linking.py +++ /dev/null @@ -1,852 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -""" -Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. -""" -import re -import csv -import json -import spacy -import datetime -import bz2 - -from spacy.kb import KnowledgeBase -from spacy.vocab import Vocab - -# requires: pip install neuralcoref --no-binary neuralcoref -# import neuralcoref - -# TODO: remove hardcoded paths -WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' -ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' -ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' - -PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' -ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' -ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' - -KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' -VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' - -TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' -TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' - - -# these will/should be matched ignoring case -wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", - "d", "dbdump", "download", "Draft", "Education", "Foundation", - "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", - "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", - "MediaZilla", "Meta", "Metawikipedia", "Module", - "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", - "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", - "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", - "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", - "User", "User talk", "v", "voy", - "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", - "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", - "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] - -map_alias_to_link = dict() - - -def read_wikipedia_prior_probs(): - """ - STEP 1: 
Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities - The full file takes about 2h to parse 1100M lines (update printed every 5M lines). - It works relatively fast because we don't care about which article we parsed the interwiki from, - we just process line by line. - """ - - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - while line: - if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - aliases, entities, normalizations = _get_wp_links(clean_line) - for alias, entity, norm in zip(aliases, entities, normalizations): - _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) - _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) - - line = file.readline() - cnt += 1 - - # write all aliases and their entities and occurrences to file - with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: - outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") - for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): - outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") - - -# find the links -link_regex = re.compile(r'\[\[[^\[\]]*\]\]') - -# match on interwiki links, e.g. `en:` or `:fr:` -ns_regex = r":?" + "[a-z][a-z]" + ":" - -# match on Namespace: optionally preceded by a : -for ns in wiki_namespaces: - ns_regex += "|" + ":?" + ns + ":" - -ns_regex = re.compile(ns_regex, re.IGNORECASE) - - -def _get_wp_links(text): - aliases = [] - entities = [] - normalizations = [] - - matches = link_regex.findall(text) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore namespaces at the beginning of the string - - # this is a simple link, with the alias the same as the mention - elif "|" not in match: - aliases.append(match) - entities.append(match) - normalizations.append(True) - - # in wiki format, the link is written as [[entity|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - else: - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - - return aliases, entities, normalizations - - -def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): - alias = alias.strip() - entity = entity.strip() - - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity = _capitalize_first(entity.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] - - if alias and entity: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity, 0) - alias_dict[entity] = entity_count + 1 - map_alias_to_link[alias] = alias_dict - - -def _capitalize_first(text): - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - - -def write_entity_counts(to_print=False): - """ STEP 2: write entity counts """ - entity_to_count = dict() - total_count = 0 - - with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: - # 
skip header - prior_file.readline() - line = prior_file.readline() - - while line: - splits = line.replace('\n', "").split(sep='|') - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - entity_file.write(entity + "|" + str(count) + "\n") - - if to_print: - for entity, count in entity_to_count.items(): - print("Entity count:", entity, count) - print("Total count:", total_count) - - -def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True): - """ STEP 3: create the knowledge base """ - kb = KnowledgeBase(vocab=vocab) - - print() - print("1. _read_wikidata_entities", datetime.datetime.now()) - print() - # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) - title_to_id = _read_wikidata_entities_json(limit=None) - - # write the title-ID mapping to file - if write_entity_defs: - with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file: - entity_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - entity_file.write(title + "|" + str(qid) + "\n") - - title_list = list(title_to_id.keys()) - entity_list = [title_to_id[x] for x in title_list] - - print() - print("2. _get_entity_frequencies", datetime.datetime.now()) - print() - entity_frequencies = _get_entity_frequencies(entities=title_list) - - print() - print("3. adding", len(entity_list), "entities", datetime.datetime.now()) - print() - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) - - print() - print("4. 
adding aliases", datetime.datetime.now()) - print() - _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) - - # TODO: read wikipedia texts for entity context - # _read_wikipedia() - - if to_print: - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) - - print("done with kb", datetime.datetime.now()) - - return kb - - -def _get_entity_frequencies(entities): - entity_to_count = dict() - with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') - # skip header - next(csvreader) - for row in csvreader: - entity_to_count[row[0]] = int(row[1]) - - return [entity_to_count.get(e, 0) for e in entities] - - -def _get_entity_to_id(): - entity_to_id = dict() - with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') - # skip header - next(csvreader) - for row in csvreader: - entity_to_id[row[0]] = row[1] - - return entity_to_id - - -def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): - wp_titles = title_to_id.keys() - - if to_print: - print("wp titles:", wp_titles) - - # adding aliases with prior probabilities - with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count - previous_alias = None - total_count = 0 - counts = list() - entities = list() - while line: - splits = line.replace('\n', "").split(sep='|') - new_alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - if new_alias != previous_alias and previous_alias: - # done reading the previous alias --> output - if len(entities) > 0: - selected_entities = list() - prior_probs = list() - for ent_count, ent_string in zip(counts, entities): - if ent_string in wp_titles: - wd_id = title_to_id[ent_string] - p_entity_givenalias = ent_count / total_count - selected_entities.append(wd_id) - prior_probs.append(p_entity_givenalias) - - if selected_entities: - try: - kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) - except ValueError as e: - print(e) - total_count = 0 - counts = list() - entities = list() - - total_count += count - - if len(entities) < max_entities_per_alias and count >= min_occ: - counts.append(count) - entities.append(entity) - previous_alias = new_alias - - line = prior_file.readline() - - if to_print: - print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) - - -def _read_wikidata_entities_json(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" - - languages = {'en', 'de'} - prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected - site_filter = 'enwiki' - - title_to_id = dict() - - # parse appropriate fields - depending on what we need in the KB - parse_properties = False - parse_sitelinks = True - parse_labels = False - parse_descriptions = False - parse_aliases = False - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - obj = json.loads(clean_line) - entry_type = obj["type"] - - if entry_type == "item": - # filtering records on their properties - keep = False - - claims = obj["claims"] - for prop, value_set in prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - cp_rank = cp['rank'] - if cp_rank != "deprecated" and cp_id in value_set: - keep = True - - if keep: - unique_id = obj["id"] - - if to_print: - print("ID:", unique_id) - print("type:", entry_type) - - # parsing all properties that refer to other entities - if parse_properties: - for prop, claim_property in claims.items(): - cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] - cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - if parse_sitelinks: - site_value = obj["sitelinks"].get(site_filter, None) - if site_value: - site = site_value['title'] - if to_print: - print(site_filter, ":", site) - title_to_id[site] = unique_id - # print(site, "for", unique_id) - - if parse_labels: - labels = obj["labels"] - if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) - - if parse_descriptions: - descriptions = obj["descriptions"] - if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + "):", lang_descr["value"]) - - if parse_aliases: - aliases = obj["aliases"] - if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) - - if to_print: - print() - line = file.readline() - cnt += 1 - - return title_to_id - - -def _read_wikidata_entities_regex_depr(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. 
""" - - regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) - regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) - regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - - title_to_id = dict() - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - clean_line = line.strip().decode("utf-8") - keep = False - - p31_matches = regex_p31.findall(clean_line) - if p31_matches: - for p31_match in p31_matches: - id_matches = regex_id.findall(p31_match) - for id_match in id_matches: - id_match = id_match[6:][:-1] - if id_match == "Q5" or id_match == "Q15632617": - keep = True - - if keep: - id_match = regex_id.search(clean_line).group(0) - id_match = id_match[6:][:-1] - - enwiki_matches = regex_enwiki.findall(clean_line) - if enwiki_matches: - for enwiki_match in enwiki_matches: - title_match = regex_title.search(enwiki_match).group(0) - title = title_match[9:][:-1] - title_to_id[title] = id_match - # print(title, "for", id_match) - - line = file.readline() - cnt += 1 - - return title_to_id - - -def test_kb(kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - nlp = spacy.load('en_core_web_sm') - - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - candidates = my_kb.get_candidates("Bush") - - print("generating candidates for 'Bush' :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." - doc = nlp(text) - - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - -def add_coref(): - """ STEP 5: add coreference resolution to our model """ - nlp = spacy.load('en_core_web_sm') - # nlp = spacy.load('en') - - # TODO: this doesn't work yet - # neuralcoref.add_to_pipe(nlp) - print("done adding to pipe") - - doc = nlp(u'My sister has a dog. 
She loves him.') - print("done doc") - - print(doc._.has_coref) - print(doc._.coref_clusters) - - -def create_training(kb): - if not kb: - raise ValueError("kb should be defined") - # nlp = spacy.load('en_core_web_sm') - wp_to_id = _get_entity_to_id() - _read_wikipedia_texts(kb, wp_to_id, limit=100000000) # TODO: full dataset - - -def _read_wikipedia_texts(kb, wp_to_id, limit=None): - """ - Read the XML wikipedia data to parse out training data: - raw text data + positive and negative instances - """ - - title_regex = re.compile(r'(?<=).*(?=)') - id_regex = re.compile(r'(?<=)\d*(?=)') - - read_ids = set() - - entityfile_loc = TRAINING_OUTPUT_SET_DIR + "/" + "gold_entities.csv" - with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: - # write entity training header file - _write_training_entity(outputfile=entityfile, - article_id="article_id", - alias="alias", - entity="entity", - correct="correct") - - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - while line and (not limit or cnt < limit): - if cnt % 1000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - # print(clean_line) - - if clean_line == "": - reading_revision = True - elif clean_line == "": - reading_revision = False - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - - # finished reading this page - elif clean_line == "": - if article_id: - try: - _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip()) - # on a previous run, an error occurred after 46M lines and 2h - except Exception as e: - print("Error processing article", article_id, article_title) - print(e) - else: - print("Done processing a page, but couldn't find an article_id ?") - print(article_title) - print(article_text) - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - - # start reading text within a page - if ").*(?=", entity) - candidates = kb.get_candidates(alias) - - # as training data, we only store entities that are sufficiently ambiguous - if len(candidates) > 1: - _write_training_article(article_id=article_id, clean_text=clean_text) - # print("alias", alias) - - # print all incorrect candidates - for c in candidates: - if entity != c.entity_: - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=alias, - entity=c.entity_, - correct="0") - - # print the one correct candidate - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=alias, - entity=entity, - correct="1") - - # print("gold entity", entity) - # print() - - # _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict) - # print() - - -info_regex = re.compile(r'{[^{]*?}') -interwiki_regex = re.compile(r'\[\[([^|]*?)]]') -interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') -htlm_regex = re.compile(r'<!--[^!]*-->') -category_regex = re.compile(r'\[\[Category:[^\[]*]]') -file_regex = re.compile(r'\[\[File:[^[\]]+]]') -ref_regex = re.compile(r'<ref.*?>') # non-greedy -ref_2_regex = re.compile(r'</ref.*?>') # non-greedy - - -def _get_clean_wp_text(article_text): - clean_text = article_text.strip() - - # remove bolding & italic markup - clean_text = clean_text.replace('\'\'\'', '') - clean_text = clean_text.replace('\'\'', '') - - # 
remove nested {{info}} statements by removing the inner/smallest ones first and iterating - try_again = True - previous_length = len(clean_text) - while try_again: - clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { - if len(clean_text) < previous_length: - try_again = True - else: - try_again = False - previous_length = len(clean_text) - - # remove simple interwiki links (no alternative name) - clean_text = interwiki_regex.sub(r'\1', clean_text) - - # remove simple interwiki links by picking the alternative name - clean_text = interwiki_2_regex.sub(r'\1', clean_text) - - # remove HTML comments - clean_text = htlm_regex.sub('', clean_text) - - # remove Category and File statements - clean_text = category_regex.sub('', clean_text) - clean_text = file_regex.sub('', clean_text) - - # remove multiple = - while '==' in clean_text: - clean_text = clean_text.replace("==", "=") - - clean_text = clean_text.replace(". =", ".") - clean_text = clean_text.replace(" = ", ". ") - clean_text = clean_text.replace("= ", ".") - clean_text = clean_text.replace(" =", "") - - # remove refs (non-greedy match) - clean_text = ref_regex.sub('', clean_text) - clean_text = ref_2_regex.sub('', clean_text) - - # remove additional wikiformatting - clean_text = re.sub(r'<blockquote>', '', clean_text) - clean_text = re.sub(r'</blockquote>', '', clean_text) - - # change special characters back to normal ones - clean_text = clean_text.replace(r'<', '<') - clean_text = clean_text.replace(r'>', '>') - clean_text = clean_text.replace(r'"', '"') - clean_text = clean_text.replace(r'&nbsp;', ' ') - clean_text = clean_text.replace(r'&', '&') - - # remove multiple spaces - while ' ' in clean_text: - clean_text = clean_text.replace(' ', ' ') - - return clean_text.strip() - - -def _write_training_article(article_id, clean_text): - file_loc = TRAINING_OUTPUT_SET_DIR + "/" + str(article_id) + ".txt" - with open(file_loc, mode='w', encoding='utf8') as outputfile: - outputfile.write(clean_text) - - -def _write_training_entity(outputfile, article_id, alias, entity, correct): - outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") - - -def _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict): - doc = nlp(clean_text) - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to non-persons - ent_id = article_dict.get(ent.text) - if ent_id: - print(" -", ent.text, ent.label_, ent_id) - else: - print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases - - -if __name__ == "__main__": - print("START", datetime.datetime.now()) - print() - my_kb = None - - # one-time methods to create KB and write to file - to_create_prior_probs = False - to_create_entity_counts = False - to_create_kb = False - - # read KB back in from file - to_read_kb = True - to_test_kb = False - - create_wp_training = True - - # STEP 1 : create prior probabilities from WP - # run only once ! - if to_create_prior_probs: - print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - read_wikipedia_prior_probs() - print() - - # STEP 2 : deduce entity frequencies from WP - # run only once ! - if to_create_entity_counts: - print("STEP 2: to_create_entity_counts", datetime.datetime.now()) - write_entity_counts() - print() - - # STEP 3 : create KB and write to file - # run only once ! 
- if to_create_kb: - print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_sm') - my_vocab = my_nlp.vocab - my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) - print() - - print("STEP 3b: write KB", datetime.datetime.now()) - my_kb.dump(KB_FILE) - my_vocab.to_disk(VOCAB_DIR) - print() - - # STEP 4 : read KB back in from file - if to_read_kb: - print("STEP 4: to_read_kb", datetime.datetime.now()) - my_vocab = Vocab() - my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab) - my_kb.load_bulk(KB_FILE) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) - print() - - # test KB - if to_test_kb: - test_kb(my_kb) - print() - - # STEP 5: create a training dataset from WP - if create_wp_training: - print("STEP 5: create training dataset", datetime.datetime.now()) - create_training(my_kb) - - # TODO coreference resolution - # add_coref() - - print() - print("STOP", datetime.datetime.now()) From 7e348d7f7ff2d79beec90f8f9862fc52cad8b654 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 6 May 2019 15:13:50 +0200 Subject: [PATCH 031/148] baseline evaluation using highest-freq candidate --- .../wiki_entity_linking/kb_creator.py | 23 ---- .../pipeline/wiki_entity_linking/run_el.py | 101 ++++++++++++++++++ .../training_set_creator.py | 36 ++++++- .../wiki_entity_linking/wiki_nel_pipeline.py | 21 +++- 4 files changed, 152 insertions(+), 29 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 7ca7cfad1..b9e663bb9 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in if to_print: print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) - -def test_kb(kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - nlp = spacy.load('en_core_web_sm') - - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - candidates = kb.get_candidates("Bush") - - print("generating candidates for 'Bush' :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." 
- doc = nlp(text) - - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index eb8343722..c2156e31b 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -1,12 +1,113 @@ # coding: utf-8 from __future__ import unicode_literals +import os import spacy +import datetime +from os import listdir + +from examples.pipeline.wiki_entity_linking import training_set_creator # requires: pip install neuralcoref --no-binary neuralcoref # import neuralcoref +def run_el_toy_example(nlp, kb): + _prepare_pipeline(nlp, kb) + + candidates = kb.get_candidates("Bush") + + print("generating candidates for 'Bush' :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." + doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +def run_el_training(nlp, kb, training_dir, limit=None): + _prepare_pipeline(nlp, kb) + + correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=False) + + predictions = list() + golds = list() + + cnt = 0 + for f in listdir(training_dir): + if not limit or cnt < limit: + if is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0: + print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + cnt += 1 + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + doc = nlp(text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to other types + gold_entity = correct_entries_per_article[article_id].get(ent.text, None) + # only evaluating gold entities we know, because the training data is not complete + if gold_entity: + predictions.append(ent.kb_id_) + golds.append(gold_entity) + + print("Processed", cnt, "dev articles") + print() + evaluate(predictions, golds) + + +def is_dev(file_name): + return file_name.endswith("3.txt") + + +def evaluate(predictions, golds): + if len(predictions) != len(golds): + raise ValueError("predictions and gold entities should have the same length") + + print("Evaluating", len(golds), "entities") + + tp = 0 + fp = 0 + fn = 0 + + for pred, gold in zip(predictions, golds): + is_correct = pred == gold + if not pred: + fn += 1 + elif is_correct: + tp += 1 + else: + fp += 1 + + print("tp", tp) + print("fp", fp) + print("fn", fn) + + precision = tp / (tp + fp + 0.0000001) + recall = tp / (tp + fn + 0.0000001) + fscore = 2 * recall * precision / (recall + precision + 0.0000001) + + print("precision", round(100 * precision, 1), "%") + print("recall", round(100 * recall, 1), "%") + print("Fscore", round(100 * fscore, 1), "%") + + +def _prepare_pipeline(nlp, kb): + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + # TODO def add_coref(): """ Add coreference resolution to our model """ diff --git 
a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index e46aeec5b..47349d3dc 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -12,6 +12,7 @@ from . import wikipedia_processor as wp Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ +ENTITY_FILE = "gold_entities.csv" def create_training(kb, entity_input, training_output): if not kb: @@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): read_ids = set() - entityfile_loc = training_output + "/" + "gold_entities.csv" + entityfile_loc = training_output + "/" + ENTITY_FILE with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: # write entity training header file _write_training_entity(outputfile=entityfile, @@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output): def _write_training_entity(outputfile, article_id, alias, entity, correct): outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") + + +def read_training_entities(training_output, collect_correct=True, collect_incorrect=False): + entityfile_loc = training_output + "/" + ENTITY_FILE + incorrect_entries_per_article = dict() + correct_entries_per_article = dict() + with open(entityfile_loc, mode='r', encoding='utf8') as file: + for line in file: + fields = line.replace('\n', "").split(sep='|') + article_id = fields[0] + alias = fields[1] + entity = fields[2] + correct = fields[3] + + if correct == "1" and collect_correct: + entry_dict = correct_entries_per_article.get(article_id, dict()) + if alias in entry_dict: + raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE) + entry_dict[alias] = entity + correct_entries_per_article[article_id] = entry_dict + + if correct == "0" and collect_incorrect: + entry_dict = incorrect_entries_per_article.get(article_id, dict()) + entities = entry_dict.get(alias, set()) + entities.add(entity) + entry_dict[alias] = entities + incorrect_entries_per_article[article_id] = entry_dict + + return correct_entries_per_article, incorrect_entries_per_article + + + + diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 20d4f5953..ebc1e7958 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from . 
import wikipedia_processor as wp, kb_creator, training_set_creator +from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el import spacy from spacy.vocab import Vocab @@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' -TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' +TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' if __name__ == "__main__": @@ -37,8 +36,12 @@ if __name__ == "__main__": to_read_kb = True to_test_kb = False + # create training dataset create_wp_training = False + # apply named entity linking to the training dataset + apply_to_training = True + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -88,13 +91,21 @@ if __name__ == "__main__": # test KB if to_test_kb: - kb_creator.test_kb(my_kb) + my_nlp = spacy.load('en_core_web_sm') + run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp) print() # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR) + training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR) + + # STEP 6: apply the EL algorithm on the training dataset + if apply_to_training: + my_nlp = spacy.load('en_core_web_sm') + run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000) + print() + # TODO coreference resolution # add_coref() From 9f33732b96310dc482097e1a6661415a08acc57a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 7 May 2019 16:03:42 +0200 Subject: [PATCH 032/148] using entity descriptions and article texts as input embedding vectors for training --- .../wiki_entity_linking/kb_creator.py | 50 +++++++++-- .../pipeline/wiki_entity_linking/run_el.py | 4 +- .../pipeline/wiki_entity_linking/train_el.py | 58 ++++++++++++ .../training_set_creator.py | 19 +--- .../wiki_entity_linking/wiki_nel_pipeline.py | 34 ++++--- .../wiki_entity_linking/wikidata_processor.py | 90 ++++--------------- 6 files changed, 147 insertions(+), 108 deletions(-) create mode 100644 examples/pipeline/wiki_entity_linking/train_el.py diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index b9e663bb9..bb00f918d 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -4,13 +4,16 @@ from __future__ import unicode_literals import spacy from spacy.kb import KnowledgeBase +import csv import datetime from . import wikipedia_processor as wp from . import wikidata_processor as wd -def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input, +def create_kb(vocab, max_entities_per_alias, min_occ, + entity_def_output, entity_descr_output, + count_input, prior_prob_input, to_print=False, write_entity_defs=True): """ Create the knowledge base from Wikidata entries """ kb = KnowledgeBase(vocab=vocab) @@ -18,15 +21,11 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input print() print("1. 
_read_wikidata_entities", datetime.datetime.now()) print() - # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) - title_to_id = wd.read_wikidata_entities_json(limit=None) + title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) - # write the title-ID mapping to file + # write the title-ID and ID-description mappings to file if write_entity_defs: - with open(entity_output, mode='w', encoding='utf8') as entity_file: - entity_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - entity_file.write(title + "|" + str(qid) + "\n") + _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -57,6 +56,41 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input return kb +def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr): + with open(entity_def_output, mode='w', encoding='utf8') as id_file: + id_file.write("WP_title" + "|" + "WD_id" + "\n") + for title, qid in title_to_id.items(): + id_file.write(title + "|" + str(qid) + "\n") + with open(entity_descr_output, mode='w', encoding='utf8') as descr_file: + descr_file.write("WD_id" + "|" + "description" + "\n") + for qid, descr in id_to_descr.items(): + descr_file.write(str(qid) + "|" + descr + "\n") + + +def _get_entity_to_id(entity_def_output): + entity_to_id = dict() + with open(entity_def_output, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_id[row[0]] = row[1] + + return entity_to_id + + +def _get_id_to_description(entity_descr_output): + id_to_desc = dict() + with open(entity_descr_output, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + id_to_desc[row[0]] = row[1] + + return id_to_desc + + def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False): wp_titles = title_to_id.keys() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index c2156e31b..96fe58740 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -32,7 +32,7 @@ def run_el_toy_example(nlp, kb): print("ent", ent.text, ent.label_, ent.kb_id_) -def run_el_training(nlp, kb, training_dir, limit=None): +def run_el_dev(nlp, kb, training_dir, limit=None): _prepare_pipeline(nlp, kb) correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, @@ -48,7 +48,7 @@ def run_el_training(nlp, kb, training_dir, limit=None): if is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0: - print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") cnt += 1 with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py new file mode 100644 index 000000000..b3ebb658f --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import datetime +from os import listdir + +from examples.pipeline.wiki_entity_linking 
import run_el, training_set_creator, kb_creator +from examples.pipeline.wiki_entity_linking import wikidata_processor as wd + +""" TODO: this code needs to be implemented in pipes.pyx""" + + +def train_model(kb, nlp, training_dir, entity_descr_output, limit=None): + run_el._prepare_pipeline(nlp, kb) + + correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=True) + + entities = kb.get_entity_strings() + + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + + cnt = 0 + for f in listdir(training_dir): + if not limit or cnt < limit: + if not run_el.is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0: + print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") + cnt += 1 + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + print() + doc = nlp(text) + doc_vector = doc.vector + print("FILE", f, len(doc_vector), "D vector") + + for mention_pos, entity_pos in correct_entries[article_id].items(): + descr = id_to_descr.get(entity_pos) + if descr: + doc_descr = nlp(descr) + descr_vector = doc_descr.vector + print("GOLD POS", mention_pos, entity_pos, len(descr_vector), "D vector") + + for mention_neg, entity_negs in incorrect_entries[article_id].items(): + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + doc_descr = nlp(descr) + descr_vector = doc_descr.vector + print("GOLD NEG", mention_neg, entity_neg, len(descr_vector), "D vector") + + print() + print("Processed", cnt, "dev articles") + print() + diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 47349d3dc..b1c63c55c 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -6,7 +6,7 @@ import csv import bz2 import datetime -from . import wikipedia_processor as wp +from . 
import wikipedia_processor as wp, kb_creator """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm @@ -14,26 +14,15 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm ENTITY_FILE = "gold_entities.csv" -def create_training(kb, entity_input, training_output): + +def create_training(kb, entity_def_input, training_output): if not kb: raise ValueError("kb should be defined") # nlp = spacy.load('en_core_web_sm') - wp_to_id = _get_entity_to_id(entity_input) + wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset -def _get_entity_to_id(entity_input): - entity_to_id = dict() - with open(entity_input, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') - # skip header - next(csvreader) - for row in csvreader: - entity_to_id[row[0]] = row[1] - - return entity_to_id - - def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ebc1e7958..26e2a7ae2 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el, train_el import spacy from spacy.vocab import Vocab @@ -15,11 +15,12 @@ Demonstrate how to build a knowledge base from WikiData and run an Entity Linkin PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' +ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' if __name__ == "__main__": @@ -30,17 +31,20 @@ if __name__ == "__main__": # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = False + to_create_kb = True # read KB back in from file to_read_kb = True - to_test_kb = False + to_test_kb = True # create training dataset create_wp_training = False - # apply named entity linking to the training dataset - apply_to_training = True + # run training + run_training = False + + # apply named entity linking to the dev dataset + apply_to_dev = False # STEP 1 : create prior probabilities from WP # run only once ! 
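A note on the entity mapping files handled above: _write_entity_files and the matching _get_entity_to_id / _get_id_to_description readers in kb_creator round-trip the "WP_title|WD_id" and "WD_id|description" tables as plain pipe-delimited text with one header row. A minimal, self-contained sketch of that round trip; the file name and the sample rows are invented for illustration only:

    import csv

    def write_entity_defs(path, title_to_id):
        # one "WP_title|WD_id" line per entity, mirroring _write_entity_files
        with open(path, mode="w", encoding="utf8") as out_file:
            out_file.write("WP_title" + "|" + "WD_id" + "\n")
            for title, qid in title_to_id.items():
                out_file.write(title + "|" + str(qid) + "\n")

    def read_entity_defs(path):
        # mirrors _get_entity_to_id: skip the header, keep the first two columns
        entity_to_id = dict()
        with open(path, "r", encoding="utf8") as csvfile:
            csvreader = csv.reader(csvfile, delimiter="|")
            next(csvreader)  # header
            for row in csvreader:
                entity_to_id[row[0]] = row[1]
        return entity_to_id

    if __name__ == "__main__":
        sample = {"Berlin": "Q64", "London": "Q84"}   # made-up sample rows
        write_entity_defs("entity_defs_sample.csv", sample)
        assert read_entity_defs("entity_defs_sample.csv") == sample

Because the readers keep only row[0] and row[1], a description that itself contains "|" would be silently truncated when read back; quoting or escaping the delimiter would make the format more robust.
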
@@ -65,7 +69,8 @@ if __name__ == "__main__": my_kb = kb_creator.create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, - entity_output=ENTITY_DEFS, + entity_def_output=ENTITY_DEFS, + entity_descr_output=ENTITY_DESCR, count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, to_print=False) @@ -98,12 +103,19 @@ if __name__ == "__main__": # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: apply the EL algorithm on the training dataset - if apply_to_training: + # STEP 7: apply the EL algorithm on the training dataset + if run_training: + print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_sm') - run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000) + train_el.train_model(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=5) + print() + + # STEP 8: apply the EL algorithm on the dev dataset + if apply_to_dev: + my_nlp = spacy.load('en_core_web_sm') + run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) print() diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py index 03db05414..7d84b1a2a 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -13,17 +13,18 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js def read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" - languages = {'en', 'de'} + lang = 'en' prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' title_to_id = dict() + id_to_descr = dict() # parse appropriate fields - depending on what we need in the KB parse_properties = False parse_sitelinks = True parse_labels = False - parse_descriptions = False + parse_descriptions = True parse_aliases = False with bz2.open(WIKIDATA_JSON, mode='rb') as file: @@ -76,91 +77,36 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id - # print(site, "for", unique_id) if parse_labels: labels = obj["labels"] if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) if parse_descriptions: descriptions = obj["descriptions"] if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + "):", lang_descr["value"]) + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + id_to_descr[unique_id] = lang_descr["value"] if parse_aliases: aliases = obj["aliases"] if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) if to_print: print() line = file.readline() cnt += 1 - return title_to_id - - -def _read_wikidata_entities_regex_depr(limit=None): - """ - Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. - TODO: doesn't work yet. may be deleted ? 
- """ - - regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) - regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) - regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - - title_to_id = dict() - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - clean_line = line.strip().decode("utf-8") - keep = False - - p31_matches = regex_p31.findall(clean_line) - if p31_matches: - for p31_match in p31_matches: - id_matches = regex_id.findall(p31_match) - for id_match in id_matches: - id_match = id_match[6:][:-1] - if id_match == "Q5" or id_match == "Q15632617": - keep = True - - if keep: - id_match = regex_id.search(clean_line).group(0) - id_match = id_match[6:][:-1] - - enwiki_matches = regex_enwiki.findall(clean_line) - if enwiki_matches: - for enwiki_match in enwiki_matches: - title_match = regex_title.search(enwiki_match).group(0) - title = title_match[9:][:-1] - title_to_id[title] = id_match - - line = file.readline() - cnt += 1 - - return title_to_id + return title_to_id, id_to_descr From c6ca8649d7ab67af88af1682fa93a63fc635481c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 9 May 2019 17:23:19 +0200 Subject: [PATCH 033/148] first stab at model - not functional yet --- .../pipeline/wiki_entity_linking/train_el.py | 179 ++++++++++++++---- .../wiki_entity_linking/wiki_nel_pipeline.py | 20 +- 2 files changed, 158 insertions(+), 41 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index b3ebb658f..8dcea9256 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -6,53 +6,168 @@ import datetime from os import listdir from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from examples.pipeline.wiki_entity_linking import wikidata_processor as wd + +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init + +from thinc.api import chain +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu +from thinc.api import flatten_add_lengths +from thinc.t2v import Pooling, sum_pool, mean_pool +from thinc.t2t import ExtractWindow, ParametricAttention +from thinc.misc import Residual """ TODO: this code needs to be implemented in pipes.pyx""" -def train_model(kb, nlp, training_dir, entity_descr_output, limit=None): - run_el._prepare_pipeline(nlp, kb) +class EL_Model(): - correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=True) + labels = ["MATCH", "NOMATCH"] + name = "entity_linker" - entities = kb.get_entity_strings() + def __init__(self, kb, nlp): + run_el._prepare_pipeline(nlp, kb) + self.nlp = nlp + self.kb = kb - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + self.entity_encoder = self._simple_encoder(width=300) + self.article_encoder = self._simple_encoder(width=300) - cnt = 0 - for f in listdir(training_dir): - if not limit or cnt < limit: - if not run_el.is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0: - print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") 
- cnt += 1 - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - print() - doc = nlp(text) - doc_vector = doc.vector - print("FILE", f, len(doc_vector), "D vector") + def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): + instances, gold_vectors, entity_descriptions, doc_by_article = self._get_training_data(training_dir, + entity_descr_output, + limit, to_print) + + if to_print: + print("Training on", len(gold_vectors), "instances") + print(" - pos:", len([x for x in gold_vectors if x]), "instances") + print(" - pos:", len([x for x in gold_vectors if not x]), "instances") + print() + + self.sgd_entity = self.begin_training(self.entity_encoder) + self.sgd_article = self.begin_training(self.article_encoder) + + losses = {} + + for inst, label, entity_descr in zip(instances, gold_vectors, entity_descriptions): + article = inst.split(sep="_")[0] + entity_id = inst.split(sep="_")[1] + article_doc = doc_by_article[article] + self.update(article_doc, entity_descr, label, losses=losses) + + def _simple_encoder(self, width): + with Model.define_operators({">>": chain}): + encoder = SpacyVectors \ + >> flatten_add_lengths \ + >> ParametricAttention(width)\ + >> Pooling(sum_pool) \ + >> Residual(zero_init(Maxout(width, width))) + + return encoder + + def begin_training(self, model): + # TODO ? link_vectors_to_models(self.vocab) + sgd = create_default_optimizer(model.ops) + return sgd + + def update(self, article_doc, entity_descr, label, drop=0., losses=None): + entity_encoding, entity_bp = self.entity_encoder.begin_update([entity_descr], drop=drop) + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + + # print("entity/article output dim", len(entity_encoding[0]), len(doc_encoding[0])) + + mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) + + # print() + + # TODO: proper backpropagation taking ranking of elements into account ? + # TODO backpropagation also for negative examples + if label: + entity_bp(diffs, sgd=self.sgd_entity) + article_bp(diffs, sgd=self.sgd_article) + print(mse) + + + # TODO delete ? 
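The update() above scores an (article, description) pair with the mean squared error between the two encodings and then passes the raw difference vector to both backprop callbacks (proper backpropagation, including the negative examples, is still marked as a TODO there). A rough numpy sketch of that similarity and of the gradient it implies; the shapes assume a single example of width d, and this illustrates the arithmetic only, not the thinc machinery. Note that _calculate_similarity below divides the squared-error sum by the number of rows rather than by the width, which for a single example amounts to a plain sum of squares:

    import numpy as np

    def mse_similarity(ent_vec, doc_vec):
        # ent_vec, doc_vec: arrays of shape (1, d), one row per example
        diffs = doc_vec - ent_vec             # same orientation as above: doc minus entity
        mse = float((diffs ** 2).mean())      # standard mean squared error
        return mse, diffs

    if __name__ == "__main__":
        rng = np.random.default_rng(0)
        ent_vec = rng.normal(size=(1, 300)).astype("float32")
        doc_vec = rng.normal(size=(1, 300)).astype("float32")
        mse, diffs = mse_similarity(ent_vec, doc_vec)
        # gradient of the MSE w.r.t. doc_vec; w.r.t. ent_vec it is the same with the sign flipped
        grad_doc = 2 * diffs / diffs.shape[1]
        print(round(mse, 4), grad_doc.shape)
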
+ def _simple_cnn_model(self, internal_dim): + nr_class = len(self.labels) + with Model.define_operators({">>": chain}): + model_entity = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # entity encoding + model_doc = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # doc encoding + output_layer = Softmax(nr_class, internal_dim*2) + model = (model_entity | model_doc) >> output_layer + # model.tok2vec = chain(tok2vec, flatten) + model.nO = nr_class + return model + + def predict(self, entity_doc, article_doc): + entity_encoding = self.entity_encoder(entity_doc) + doc_encoding = self.article_encoder(article_doc) + + print("entity_encodings", len(entity_encoding), entity_encoding) + print("doc_encodings", len(doc_encoding), doc_encoding) + mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) + print("mse", mse) + + return mse + + def _calculate_similarity(self, vector1, vector2): + if len(vector1) != len(vector2): + raise ValueError("To calculate similarity, both vectors should be of equal length") + + diffs = (vector2 - vector1) + error_sum = (diffs ** 2).sum(axis=1) + mean_square_error = error_sum / len(vector1) + return float(mean_square_error), diffs + + def _get_labels(self): + return tuple(self.labels) + + def _get_training_data(self, training_dir, entity_descr_output, limit, to_print): + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + + correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=True) + + instances = list() + entity_descriptions = list() + local_vectors = list() # TODO: local vectors + gold_vectors = list() + doc_by_article = dict() + + cnt = 0 + for f in listdir(training_dir): + if not limit or cnt < limit: + if not run_el.is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0 and to_print: + print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") + cnt += 1 + if article_id not in doc_by_article: + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + doc = self.nlp(text) + doc_by_article[article_id] = doc for mention_pos, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - doc_descr = nlp(descr) - descr_vector = doc_descr.vector - print("GOLD POS", mention_pos, entity_pos, len(descr_vector), "D vector") + instances.append(article_id + "_" + entity_pos) + doc = self.nlp(descr) + entity_descriptions.append(doc) + gold_vectors.append(True) for mention_neg, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - doc_descr = nlp(descr) - descr_vector = doc_descr.vector - print("GOLD NEG", mention_neg, entity_neg, len(descr_vector), "D vector") - - print() - print("Processed", cnt, "dev articles") - print() + instances.append(article_id + "_" + entity_neg) + doc = self.nlp(descr) + entity_descriptions.append(doc) + gold_vectors.append(False) + if to_print: + print() + print("Processed", cnt, "dev articles") + print() + return instances, gold_vectors, entity_descriptions, doc_by_article diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 26e2a7ae2..83650aa8d 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,7 +1,8 @@ # coding: utf-8 from 
__future__ import unicode_literals -from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el, train_el +from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy from spacy.vocab import Vocab @@ -31,17 +32,17 @@ if __name__ == "__main__": # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False # read KB back in from file to_read_kb = True - to_test_kb = True + to_test_kb = False # create training dataset create_wp_training = False # run training - run_training = False + run_training = True # apply named entity linking to the dev dataset apply_to_dev = False @@ -105,16 +106,17 @@ if __name__ == "__main__": print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 7: apply the EL algorithm on the training dataset + # STEP 6: apply the EL algorithm on the training dataset if run_training: print("STEP 6: training ", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_sm') - train_el.train_model(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=5) + my_nlp = spacy.load('en_core_web_md') + trainer = EL_Model(kb=my_kb, nlp=my_nlp) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=50) print() - # STEP 8: apply the EL algorithm on the dev dataset + # STEP 7: apply the EL algorithm on the dev dataset if apply_to_dev: - my_nlp = spacy.load('en_core_web_sm') + my_nlp = spacy.load('en_core_web_md') run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) print() From 9d089c0410c8f71cdf80b0b5d613d8c2983fb454 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 9 May 2019 18:11:49 +0200 Subject: [PATCH 034/148] grouping clusters of instances per doc+mention --- .../pipeline/wiki_entity_linking/train_el.py | 75 +++++++++++-------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 8dcea9256..c91058d5f 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -7,7 +7,7 @@ from os import listdir from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine from thinc.api import chain from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu @@ -33,14 +33,12 @@ class EL_Model(): self.article_encoder = self._simple_encoder(width=300) def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): - instances, gold_vectors, entity_descriptions, doc_by_article = self._get_training_data(training_dir, + instances, pos_entities, neg_entities, doc_by_article = self._get_training_data(training_dir, entity_descr_output, limit, to_print) if to_print: - print("Training on", len(gold_vectors), "instances") - print(" - pos:", len([x for x in gold_vectors if x]), "instances") - print(" - pos:", len([x for x in gold_vectors if not x]), "instances") + print("Training on", len(instances), "instance clusters") print() 
self.sgd_entity = self.begin_training(self.entity_encoder) @@ -48,11 +46,20 @@ class EL_Model(): losses = {} - for inst, label, entity_descr in zip(instances, gold_vectors, entity_descriptions): - article = inst.split(sep="_")[0] - entity_id = inst.split(sep="_")[1] - article_doc = doc_by_article[article] - self.update(article_doc, entity_descr, label, losses=losses) + for inst_cluster in instances: + pos_ex = pos_entities.get(inst_cluster) + neg_exs = neg_entities.get(inst_cluster, []) + + if pos_ex and neg_exs: + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = doc_by_article[article] + self.update(article_doc, pos_ex, neg_exs, losses=losses) + # TODO + # elif not pos_ex: + # print("Weird. Couldn't find pos example for", inst_cluster) + # elif not neg_exs: + # print("Weird. Couldn't find neg examples for", inst_cluster) def _simple_encoder(self, width): with Model.define_operators({">>": chain}): @@ -69,22 +76,29 @@ class EL_Model(): sgd = create_default_optimizer(model.ops) return sgd - def update(self, article_doc, entity_descr, label, drop=0., losses=None): - entity_encoding, entity_bp = self.entity_encoder.begin_update([entity_descr], drop=drop) + def update(self, article_doc, true_entity, false_entities, drop=0., losses=None): doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) + # true_similarity = cosine(true_entity_encoding, doc_encoding) + # print("true_similarity", true_similarity) + + # for false_entity in false_entities: + # false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + # false_similarity = cosine(false_entity_encoding, doc_encoding) + # print("false_similarity", false_similarity) + # print("entity/article output dim", len(entity_encoding[0]), len(doc_encoding[0])) - mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) + mse, diffs = self._calculate_similarity(true_entity_encoding, doc_encoding) # print() # TODO: proper backpropagation taking ranking of elements into account ? # TODO backpropagation also for negative examples - if label: - entity_bp(diffs, sgd=self.sgd_entity) - article_bp(diffs, sgd=self.sgd_article) - print(mse) + true_entity_bp(diffs, sgd=self.sgd_entity) + article_bp(diffs, sgd=self.sgd_article) + print(mse) # TODO delete ? 
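For intuition about the _simple_encoder shown above: SpacyVectors >> flatten_add_lengths >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(...) turns a variable-length sequence of word vectors into one fixed-width vector per text, roughly by letting a learned query score every token, softmax-normalising those scores, and summing the reweighted token vectors. A toy numpy sketch of that attention-and-sum step, with a random query standing in for the learned parameters (a simplification of the idea, not the actual thinc implementation):

    import numpy as np

    def attention_sum_pool(token_vectors, query):
        # token_vectors: (n_tokens, width) word vectors of one text
        # query: (width,) attention query; random here, learned in the real layer
        scores = token_vectors @ query                 # one relevance score per token
        scores = scores - scores.max()                 # numerical stability for the softmax
        weights = np.exp(scores) / np.exp(scores).sum()
        weighted = token_vectors * weights[:, None]    # rescale each token vector
        return weighted.sum(axis=0)                    # (width,) vector for the whole text

    if __name__ == "__main__":
        rng = np.random.default_rng(0)
        tokens = rng.normal(size=(7, 300)).astype("float32")   # 7 tokens, 300-dim vectors
        query = rng.normal(size=300).astype("float32")
        print(attention_sum_pool(tokens, query).shape)         # (300,)

The Residual(Maxout(...)) step after the pooling then transforms this fixed-width vector further without changing its width.
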
@@ -115,7 +129,7 @@ class EL_Model(): raise ValueError("To calculate similarity, both vectors should be of equal length") diffs = (vector2 - vector1) - error_sum = (diffs ** 2).sum(axis=1) + error_sum = (diffs ** 2).sum() mean_square_error = error_sum / len(vector1) return float(mean_square_error), diffs @@ -130,10 +144,10 @@ class EL_Model(): collect_incorrect=True) instances = list() - entity_descriptions = list() local_vectors = list() # TODO: local vectors - gold_vectors = list() doc_by_article = dict() + pos_entities = dict() + neg_entities = dict() cnt = 0 for f in listdir(training_dir): @@ -149,25 +163,24 @@ class EL_Model(): doc = self.nlp(text) doc_by_article[article_id] = doc - for mention_pos, entity_pos in correct_entries[article_id].items(): + for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instances.append(article_id + "_" + entity_pos) - doc = self.nlp(descr) - entity_descriptions.append(doc) - gold_vectors.append(True) + instances.append(article_id + "_" + mention) + doc_descr = self.nlp(descr) + pos_entities[article_id + "_" + mention] = doc_descr - for mention_neg, entity_negs in incorrect_entries[article_id].items(): + for mention, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - instances.append(article_id + "_" + entity_neg) - doc = self.nlp(descr) - entity_descriptions.append(doc) - gold_vectors.append(False) + doc_descr = self.nlp(descr) + descr_list = neg_entities.get(article_id + "_" + mention, []) + descr_list.append(doc_descr) + neg_entities[article_id + "_" + mention] = descr_list if to_print: print() print("Processed", cnt, "dev articles") print() - return instances, gold_vectors, entity_descriptions, doc_by_article + return instances, pos_entities, neg_entities, doc_by_article From b6d788064afdd5871e3d15303d6f622b91a59cc0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 10 May 2019 12:53:14 +0200 Subject: [PATCH 035/148] some first experiments with different architectures and metrics --- .../pipeline/wiki_entity_linking/train_el.py | 110 ++++++++++++++---- 1 file changed, 86 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index c91058d5f..cfd17bd78 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -4,17 +4,17 @@ from __future__ import unicode_literals import os import datetime from os import listdir +import numpy as np from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init -from thinc.api import chain +from thinc.api import chain, flatten_add_lengths, with_getitem, clone, with_flatten from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu -from thinc.api import flatten_add_lengths from thinc.t2v import Pooling, sum_pool, mean_pool from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.misc import Residual +from thinc.misc import Residual, LayerNorm as LN """ TODO: this code needs to be implemented in pipes.pyx""" @@ -29,8 +29,8 @@ class EL_Model(): self.nlp = nlp self.kb = kb - self.entity_encoder = self._simple_encoder(width=300) - self.article_encoder = self._simple_encoder(width=300) + self.entity_encoder = 
self._simple_encoder(in_width=300, out_width=96) + self.article_encoder = self._simple_encoder(in_width=300, out_width=96) def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): instances, pos_entities, neg_entities, doc_by_article = self._get_training_data(training_dir, @@ -61,13 +61,36 @@ class EL_Model(): # elif not neg_exs: # print("Weird. Couldn't find neg examples for", inst_cluster) - def _simple_encoder(self, width): - with Model.define_operators({">>": chain}): + def _simple_encoder(self, in_width, out_width): + conv_depth = 1 + cnn_maxout_pieces = 3 + with Model.define_operators({">>": chain, "**": clone}): + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> ParametricAttention(in_width)\ + # >> Pooling(mean_pool) \ + # >> Residual(zero_init(Maxout(in_width, in_width))) \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> ParametricAttention(width)\ - >> Pooling(sum_pool) \ - >> Residual(zero_init(Maxout(width, width))) + >> flatten_add_lengths \ + >> with_getitem(0, Affine(in_width, in_width)) \ + >> ParametricAttention(in_width) \ + >> Pooling(sum_pool) \ + >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + + # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) + # >> logistic + + # convolution = Residual( + # ExtractWindow(nW=1) + # >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) + # ) + + # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3)) + + # encoder = SpacyVectors >> flatten_add_lengths >> convolution ** conv_depth + # encoder = with_flatten(embed >> convolution ** conv_depth, pad=conv_depth) return encoder @@ -80,25 +103,56 @@ class EL_Model(): doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - # true_similarity = cosine(true_entity_encoding, doc_encoding) - # print("true_similarity", true_similarity) + # print("encoding dim", len(true_entity_encoding[0])) - # for false_entity in false_entities: - # false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - # false_similarity = cosine(false_entity_encoding, doc_encoding) - # print("false_similarity", false_similarity) + consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) + consensus_encoding_t = consensus_encoding.transpose() - # print("entity/article output dim", len(entity_encoding[0]), len(doc_encoding[0])) + doc_mse, doc_diffs = self._calculate_similarity(doc_encoding, consensus_encoding) - mse, diffs = self._calculate_similarity(true_entity_encoding, doc_encoding) + entity_mses = list() + + true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) + # print("true_mse", true_mse) + # print("true_diffs", true_diffs) + entity_mses.append(true_mse) + # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) + # print("true_exp", true_exp) + + # false_exp_sum = 0 + + for false_entity in false_entities: + false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) + # print("false_mse", false_mse) + # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) + # print("false_exp", false_exp) + # print("false_diffs", false_diffs) 
+ entity_mses.append(false_mse) + # if false_mse > true_mse: + # true_diffs = true_diffs - false_diffs ??? + # false_exp_sum += false_exp + + # prob = true_exp / false_exp_sum + # print("prob", prob) + + entity_mses = sorted(entity_mses) + # mse_sum = sum(entity_mses) + # entity_probs = [1 - x/mse_sum for x in entity_mses] + # print("entity_mses", entity_mses) + # print("entity_probs", entity_probs) + true_index = entity_mses.index(true_mse) + # print("true index", true_index) + # print("true prob", entity_probs[true_index]) + + print(true_mse) # print() # TODO: proper backpropagation taking ranking of elements into account ? # TODO backpropagation also for negative examples - true_entity_bp(diffs, sgd=self.sgd_entity) - article_bp(diffs, sgd=self.sgd_article) - print(mse) + true_entity_bp(true_diffs, sgd=self.sgd_entity) + article_bp(doc_diffs, sgd=self.sgd_article) # TODO delete ? @@ -124,11 +178,19 @@ class EL_Model(): return mse + # TODO: expand to more than 2 vectors + def _calculate_consensus(self, vector1, vector2): + if len(vector1) != len(vector2): + raise ValueError("To calculate consenus, both vectors should be of equal length") + + avg = (vector2 + vector1) / 2 + return avg + def _calculate_similarity(self, vector1, vector2): if len(vector1) != len(vector2): raise ValueError("To calculate similarity, both vectors should be of equal length") - diffs = (vector2 - vector1) + diffs = (vector1 - vector2) error_sum = (diffs ** 2).sum() mean_square_error = error_sum / len(vector1) return float(mean_square_error), diffs From 3b81b009547b5c48dea7660e8081f050014f8609 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 13 May 2019 14:26:04 +0200 Subject: [PATCH 036/148] evaluating on dev set during training --- .../pipeline/wiki_entity_linking/run_el.py | 25 +++--- .../pipeline/wiki_entity_linking/train_el.py | 87 ++++++++++++++++--- 2 files changed, 90 insertions(+), 22 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 96fe58740..66ab0385e 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -70,12 +70,10 @@ def is_dev(file_name): return file_name.endswith("3.txt") -def evaluate(predictions, golds): +def evaluate(predictions, golds, to_print=True): if len(predictions) != len(golds): raise ValueError("predictions and gold entities should have the same length") - print("Evaluating", len(golds), "entities") - tp = 0 fp = 0 fn = 0 @@ -89,17 +87,22 @@ def evaluate(predictions, golds): else: fp += 1 - print("tp", tp) - print("fp", fp) - print("fn", fn) + if to_print: + print("Evaluating", len(golds), "entities") + print("tp", tp) + print("fp", fp) + print("fn", fn) - precision = tp / (tp + fp + 0.0000001) - recall = tp / (tp + fn + 0.0000001) + precision = 100 * tp / (tp + fp + 0.0000001) + recall = 100 * tp / (tp + fn + 0.0000001) fscore = 2 * recall * precision / (recall + precision + 0.0000001) - print("precision", round(100 * precision, 1), "%") - print("recall", round(100 * recall, 1), "%") - print("Fscore", round(100 * fscore, 1), "%") + if to_print: + print("precision", round(precision, 1), "%") + print("recall", round(recall, 1), "%") + print("Fscore", round(fscore, 1), "%") + + return precision, recall, fscore def _prepare_pipeline(nlp, kb): diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index cfd17bd78..7fd301e02 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py 
+++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -5,6 +5,7 @@ import os import datetime from os import listdir import numpy as np +from random import shuffle from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -16,6 +17,8 @@ from thinc.t2v import Pooling, sum_pool, mean_pool from thinc.t2t import ExtractWindow, ParametricAttention from thinc.misc import Residual, LayerNorm as LN +from spacy.tokens import Doc + """ TODO: this code needs to be implemented in pipes.pyx""" @@ -33,34 +36,93 @@ class EL_Model(): self.article_encoder = self._simple_encoder(in_width=300, out_width=96) def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): - instances, pos_entities, neg_entities, doc_by_article = self._get_training_data(training_dir, - entity_descr_output, - limit, to_print) + Doc.set_extension("entity_id", default=None) + + train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, + entity_descr_output, + False, + limit, to_print) + + dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, + entity_descr_output, + True, + limit, to_print) if to_print: - print("Training on", len(instances), "instance clusters") + print("Training on", len(train_instances), "instance clusters") + print("Dev test on", len(dev_instances), "instance clusters") print() self.sgd_entity = self.begin_training(self.entity_encoder) self.sgd_article = self.begin_training(self.article_encoder) + self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + losses = {} - for inst_cluster in instances: - pos_ex = pos_entities.get(inst_cluster) - neg_exs = neg_entities.get(inst_cluster, []) + for inst_cluster in train_instances: + pos_ex = train_pos.get(inst_cluster) + neg_exs = train_neg.get(inst_cluster, []) if pos_ex and neg_exs: article = inst_cluster.split(sep="_")[0] entity_id = inst_cluster.split(sep="_")[1] - article_doc = doc_by_article[article] + article_doc = train_doc[article] self.update(article_doc, pos_ex, neg_exs, losses=losses) + p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + print(round(fscore, 1)) # TODO # elif not pos_ex: # print("Weird. Couldn't find pos example for", inst_cluster) # elif not neg_exs: # print("Weird. 
Couldn't find neg examples for", inst_cluster) + def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): + predictions = list() + golds = list() + + for inst_cluster in dev_instances: + pos_ex = dev_pos.get(inst_cluster) + neg_exs = dev_neg.get(inst_cluster, []) + ex_to_id = dict() + + if pos_ex and neg_exs: + ex_to_id[pos_ex] = pos_ex._.entity_id + for neg_ex in neg_exs: + ex_to_id[neg_ex] = neg_ex._.entity_id + + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = dev_doc[article] + + examples = list(neg_exs) + examples.append(pos_ex) + shuffle(examples) + + best_entity, lowest_mse = self._predict(examples, article_doc) + predictions.append(ex_to_id[best_entity]) + golds.append(ex_to_id[pos_ex]) + + + # TODO: use lowest_mse and combine with prior probability + p, r, F = run_el.evaluate(predictions, golds, to_print=False) + return p, r, F + + def _predict(self, entities, article_doc): + doc_encoding = self.article_encoder([article_doc]) + + lowest_mse = None + best_entity = None + + for entity in entities: + entity_encoding = self.entity_encoder([entity]) + mse, _ = self._calculate_similarity(doc_encoding, entity_encoding) + if not best_entity or mse < lowest_mse: + lowest_mse = mse + best_entity = entity + + return best_entity, lowest_mse + def _simple_encoder(self, in_width, out_width): conv_depth = 1 cnn_maxout_pieces = 3 @@ -145,7 +207,7 @@ class EL_Model(): # print("true index", true_index) # print("true prob", entity_probs[true_index]) - print(true_mse) + # print("training loss", true_mse) # print() @@ -198,13 +260,14 @@ class EL_Model(): def _get_labels(self): return tuple(self.labels) - def _get_training_data(self, training_dir, entity_descr_output, limit, to_print): + def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) + instances = list() local_vectors = list() # TODO: local vectors doc_by_article = dict() @@ -214,7 +277,7 @@ class EL_Model(): cnt = 0 for f in listdir(training_dir): if not limit or cnt < limit: - if not run_el.is_dev(f): + if dev == run_el.is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") @@ -230,6 +293,7 @@ class EL_Model(): if descr: instances.append(article_id + "_" + mention) doc_descr = self.nlp(descr) + doc_descr._.entity_id = entity_pos pos_entities[article_id + "_" + mention] = doc_descr for mention, entity_negs in incorrect_entries[article_id].items(): @@ -237,6 +301,7 @@ class EL_Model(): descr = id_to_descr.get(entity_neg) if descr: doc_descr = self.nlp(descr) + doc_descr._.entity_id = entity_neg descr_list = neg_entities.get(article_id + "_" + mention, []) descr_list.append(doc_descr) neg_entities[article_id + "_" + mention] = descr_list From 4142e8dd1b05e396c6e24efb7550a86837359118 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 13 May 2019 17:02:34 +0200 Subject: [PATCH 037/148] train and predict per article (saving time for doc encoding) --- .../pipeline/wiki_entity_linking/train_el.py | 182 ++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 103 insertions(+), 81 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py 
index 7fd301e02..1e2c25ffc 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -46,11 +46,11 @@ class EL_Model(): dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, entity_descr_output, True, - limit, to_print) + limit / 10, to_print) if to_print: - print("Training on", len(train_instances), "instance clusters") - print("Dev test on", len(dev_instances), "instance clusters") + print("Training on", len(train_instances.values()), "articles") + print("Dev test on", len(dev_instances.values()), "articles") print() self.sgd_entity = self.begin_training(self.entity_encoder) @@ -60,49 +60,51 @@ class EL_Model(): losses = {} - for inst_cluster in train_instances: - pos_ex = train_pos.get(inst_cluster) - neg_exs = train_neg.get(inst_cluster, []) + instance_count = 0 + + for article_id, inst_cluster_set in train_instances.items(): + article_doc = train_doc[article_id] + pos_ex_list = list() + neg_exs_list = list() + for inst_cluster in inst_cluster_set: + instance_count += 1 + pos_ex_list.append(train_pos.get(inst_cluster)) + neg_exs_list.append(train_neg.get(inst_cluster, [])) + + self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses) + p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + print(round(fscore, 1)) + + if to_print: + print("Trained on", instance_count, "instance clusters") - if pos_ex and neg_exs: - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = train_doc[article] - self.update(article_doc, pos_ex, neg_exs, losses=losses) - p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) - print(round(fscore, 1)) - # TODO - # elif not pos_ex: - # print("Weird. Couldn't find pos example for", inst_cluster) - # elif not neg_exs: - # print("Weird. 
Couldn't find neg examples for", inst_cluster) def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): predictions = list() golds = list() - for inst_cluster in dev_instances: - pos_ex = dev_pos.get(inst_cluster) - neg_exs = dev_neg.get(inst_cluster, []) - ex_to_id = dict() + for article_id, inst_cluster_set in dev_instances.items(): + for inst_cluster in inst_cluster_set: + pos_ex = dev_pos.get(inst_cluster) + neg_exs = dev_neg.get(inst_cluster, []) + ex_to_id = dict() - if pos_ex and neg_exs: - ex_to_id[pos_ex] = pos_ex._.entity_id - for neg_ex in neg_exs: - ex_to_id[neg_ex] = neg_ex._.entity_id + if pos_ex and neg_exs: + ex_to_id[pos_ex] = pos_ex._.entity_id + for neg_ex in neg_exs: + ex_to_id[neg_ex] = neg_ex._.entity_id - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = dev_doc[article] + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = dev_doc[article] - examples = list(neg_exs) - examples.append(pos_ex) - shuffle(examples) - - best_entity, lowest_mse = self._predict(examples, article_doc) - predictions.append(ex_to_id[best_entity]) - golds.append(ex_to_id[pos_ex]) + examples = list(neg_exs) + examples.append(pos_ex) + shuffle(examples) + best_entity, lowest_mse = self._predict(examples, article_doc) + predictions.append(ex_to_id[best_entity]) + golds.append(ex_to_id[pos_ex]) # TODO: use lowest_mse and combine with prior probability p, r, F = run_el.evaluate(predictions, golds, to_print=False) @@ -161,60 +163,79 @@ class EL_Model(): sgd = create_default_optimizer(model.ops) return sgd - def update(self, article_doc, true_entity, false_entities, drop=0., losses=None): + def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): + # TODO: one call only to begin_update ? 
+ + entity_diffs = None + doc_diffs = None + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) - true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - # print("encoding dim", len(true_entity_encoding[0])) + for i, true_entity in enumerate(true_entity_list): + false_entities = false_entities_list[i] - consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - consensus_encoding_t = consensus_encoding.transpose() + true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) + # print("encoding dim", len(true_entity_encoding[0])) - doc_mse, doc_diffs = self._calculate_similarity(doc_encoding, consensus_encoding) + consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) + # consensus_encoding_t = consensus_encoding.transpose() - entity_mses = list() + doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding) - true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) - # print("true_mse", true_mse) - # print("true_diffs", true_diffs) - entity_mses.append(true_mse) - # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) - # print("true_exp", true_exp) + entity_mses = list() - # false_exp_sum = 0 + true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) + # print("true_mse", true_mse) + # print("true_diffs", true_diffs) + entity_mses.append(true_mse) + # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) + # print("true_exp", true_exp) - for false_entity in false_entities: - false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) - # print("false_mse", false_mse) - # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) - # print("false_exp", false_exp) - # print("false_diffs", false_diffs) - entity_mses.append(false_mse) - # if false_mse > true_mse: - # true_diffs = true_diffs - false_diffs ??? - # false_exp_sum += false_exp + # false_exp_sum = 0 - # prob = true_exp / false_exp_sum - # print("prob", prob) + if doc_diffs is not None: + doc_diffs += doc_diff + entity_diffs += true_diffs + else: + doc_diffs = doc_diff + entity_diffs = true_diffs - entity_mses = sorted(entity_mses) - # mse_sum = sum(entity_mses) - # entity_probs = [1 - x/mse_sum for x in entity_mses] - # print("entity_mses", entity_mses) - # print("entity_probs", entity_probs) - true_index = entity_mses.index(true_mse) - # print("true index", true_index) - # print("true prob", entity_probs[true_index]) + for false_entity in false_entities: + false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) + # print("false_mse", false_mse) + # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) + # print("false_exp", false_exp) + # print("false_diffs", false_diffs) + entity_mses.append(false_mse) + # if false_mse > true_mse: + # true_diffs = true_diffs - false_diffs ??? 
+ # false_exp_sum += false_exp - # print("training loss", true_mse) + # prob = true_exp / false_exp_sum + # print("prob", prob) - # print() + entity_mses = sorted(entity_mses) + # mse_sum = sum(entity_mses) + # entity_probs = [1 - x/mse_sum for x in entity_mses] + # print("entity_mses", entity_mses) + # print("entity_probs", entity_probs) + true_index = entity_mses.index(true_mse) + # print("true index", true_index) + # print("true prob", entity_probs[true_index]) + + # print("training loss", true_mse) + + # print() # TODO: proper backpropagation taking ranking of elements into account ? # TODO backpropagation also for negative examples - true_entity_bp(true_diffs, sgd=self.sgd_entity) - article_bp(doc_diffs, sgd=self.sgd_article) + + if doc_diffs is not None: + doc_diffs = doc_diffs / len(true_entity_list) + + true_entity_bp(entity_diffs, sgd=self.sgd_entity) + article_bp(doc_diffs, sgd=self.sgd_article) # TODO delete ? @@ -268,7 +289,7 @@ class EL_Model(): collect_incorrect=True) - instances = list() + instance_by_doc = dict() local_vectors = list() # TODO: local vectors doc_by_article = dict() pos_entities = dict() @@ -280,18 +301,19 @@ class EL_Model(): if dev == run_el.is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0 and to_print: - print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") + print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") cnt += 1 if article_id not in doc_by_article: with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() doc = self.nlp(text) doc_by_article[article_id] = doc + instance_by_doc[article_id] = set() for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instances.append(article_id + "_" + mention) + instance_by_doc[article_id].add(article_id + "_" + mention) doc_descr = self.nlp(descr) doc_descr._.entity_id = entity_pos pos_entities[article_id + "_" + mention] = doc_descr @@ -308,6 +330,6 @@ class EL_Model(): if to_print: print() - print("Processed", cnt, "dev articles") + print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return instances, pos_entities, neg_entities, doc_by_article + return instance_by_doc, pos_entities, neg_entities, doc_by_article diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 83650aa8d..581d38b1b 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=500) print() # STEP 7: apply the EL algorithm on the dev dataset From 09ed446b20fbeac06f6c88869d0e9a20e6332b03 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 14 May 2019 08:37:52 +0200 Subject: [PATCH 038/148] different architecture / settings --- .../pipeline/wiki_entity_linking/train_el.py | 43 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 1e2c25ffc..b3f42dcc4 100644 --- 
a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -4,18 +4,17 @@ from __future__ import unicode_literals import os import datetime from os import listdir -import numpy as np from random import shuffle from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator from spacy._ml import SpacyVectors, create_default_optimizer, zero_init -from thinc.api import chain, flatten_add_lengths, with_getitem, clone, with_flatten -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu +from thinc.api import chain, flatten_add_lengths, with_getitem, clone +from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu from thinc.t2v import Pooling, sum_pool, mean_pool -from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.misc import Residual, LayerNorm as LN +from thinc.t2t import ParametricAttention +from thinc.misc import Residual from spacy.tokens import Doc @@ -35,18 +34,20 @@ class EL_Model(): self.entity_encoder = self._simple_encoder(in_width=300, out_width=96) self.article_encoder = self._simple_encoder(in_width=300, out_width=96) - def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): + def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): Doc.set_extension("entity_id", default=None) train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, entity_descr_output, False, - limit, to_print) + trainlimit, + to_print) dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, entity_descr_output, True, - limit / 10, to_print) + devlimit, + to_print) if to_print: print("Training on", len(train_instances.values()), "articles") @@ -78,7 +79,6 @@ class EL_Model(): if to_print: print("Trained on", instance_count, "instance clusters") - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): predictions = list() golds = list() @@ -129,19 +129,19 @@ class EL_Model(): conv_depth = 1 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain, "**": clone}): - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> ParametricAttention(in_width)\ - # >> Pooling(mean_pool) \ - # >> Residual(zero_init(Maxout(in_width, in_width))) \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> with_getitem(0, Affine(in_width, in_width)) \ - >> ParametricAttention(in_width) \ - >> Pooling(sum_pool) \ - >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + >> flatten_add_lengths \ + >> ParametricAttention(in_width)\ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> with_getitem(0, Affine(in_width, in_width)) \ + # >> ParametricAttention(in_width) \ + # >> Pooling(sum_pool) \ + # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) # >> logistic @@ -178,7 +178,6 @@ class EL_Model(): # print("encoding dim", len(true_entity_encoding[0])) consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - # consensus_encoding_t = consensus_encoding.transpose() doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding) diff --git 
a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 581d38b1b..43cc41392 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=500) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From 2713abc651dc9f601d98e5f9b402852798e22b79 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 14 May 2019 22:55:56 +0200 Subject: [PATCH 039/148] implement loss function using dot product and prob estimate per candidate cluster --- .../pipeline/wiki_entity_linking/train_el.py | 203 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 103 insertions(+), 102 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index b3f42dcc4..06ac8d1d4 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -5,12 +5,14 @@ import os import datetime from os import listdir from random import shuffle +import numpy as np from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator from spacy._ml import SpacyVectors, create_default_optimizer, zero_init from thinc.api import chain, flatten_add_lengths, with_getitem, clone +from thinc.neural.util import get_array_module from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu from thinc.t2v import Pooling, sum_pool, mean_pool from thinc.t2t import ParametricAttention @@ -23,6 +25,11 @@ from spacy.tokens import Doc class EL_Model(): + INPUT_DIM = 300 + OUTPUT_DIM = 5 # 96 + PRINT_LOSS = True + PRINT_F = True + labels = ["MATCH", "NOMATCH"] name = "entity_linker" @@ -31,8 +38,8 @@ class EL_Model(): self.nlp = nlp self.kb = kb - self.entity_encoder = self._simple_encoder(in_width=300, out_width=96) - self.article_encoder = self._simple_encoder(in_width=300, out_width=96) + self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) + self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): Doc.set_extension("entity_id", default=None) @@ -64,17 +71,20 @@ class EL_Model(): instance_count = 0 for article_id, inst_cluster_set in train_instances.items(): + print("article", article_id) article_doc = train_doc[article_id] pos_ex_list = list() neg_exs_list = list() for inst_cluster in inst_cluster_set: + print("inst_cluster", inst_cluster) instance_count += 1 pos_ex_list.append(train_pos.get(inst_cluster)) neg_exs_list.append(train_neg.get(inst_cluster, [])) self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses) p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) - print(round(fscore, 1)) + if self.PRINT_F: + print(round(fscore, 1)) if to_print: print("Trained on", instance_count, "instance clusters") @@ -102,7 +112,7 @@ class EL_Model(): examples.append(pos_ex) shuffle(examples) - best_entity, lowest_mse = self._predict(examples, article_doc) + 
best_entity, highest_prob = self._predict(examples, article_doc) predictions.append(ex_to_id[best_entity]) golds.append(ex_to_id[pos_ex]) @@ -113,17 +123,21 @@ class EL_Model(): def _predict(self, entities, article_doc): doc_encoding = self.article_encoder([article_doc]) - lowest_mse = None + highest_prob = None best_entity = None + entity_to_vector = dict() for entity in entities: - entity_encoding = self.entity_encoder([entity]) - mse, _ = self._calculate_similarity(doc_encoding, entity_encoding) - if not best_entity or mse < lowest_mse: - lowest_mse = mse + entity_to_vector[entity] = self.entity_encoder([entity]) + + for entity in entities: + entity_encoding = entity_to_vector[entity] + prob = self._calculate_probability(doc_encoding, entity_encoding, entity_to_vector.values()) + if not best_entity or prob > highest_prob: + highest_prob = prob best_entity = entity - return best_entity, lowest_mse + return best_entity, highest_prob def _simple_encoder(self, in_width, out_width): conv_depth = 1 @@ -164,103 +178,56 @@ class EL_Model(): return sgd def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): - # TODO: one call only to begin_update ? - entity_diffs = None - doc_diffs = None - - doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) for i, true_entity in enumerate(true_entity_list): - false_entities = false_entities_list[i] + for cnt in range(10): + #try: + false_vectors = list() + false_entities = false_entities_list[i] + if len(false_entities) > 0: + # TODO: batch per doc + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + doc_encoding = doc_encoding[0] + print() + print(cnt) + print("doc", doc_encoding) - true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - # print("encoding dim", len(true_entity_encoding[0])) + for false_entity in false_entities: + # TODO: one call only to begin_update ? 
+ false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + false_entity_encoding = false_entity_encoding[0] + false_vectors.append(false_entity_encoding) - consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) + true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) + true_entity_encoding = true_entity_encoding[0] - doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding) + all_vectors = [true_entity_encoding] + all_vectors.extend(false_vectors) - entity_mses = list() + # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) - # print("true_mse", true_mse) - # print("true_diffs", true_diffs) - entity_mses.append(true_mse) - # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) - # print("true_exp", true_exp) + true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) + print("true", true_prob, true_entity_encoding) - # false_exp_sum = 0 + all_probs = [true_prob] + for false_vector in false_vectors: + false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) + print("false", false_prob, false_vector) + all_probs.append(false_prob) - if doc_diffs is not None: - doc_diffs += doc_diff - entity_diffs += true_diffs - else: - doc_diffs = doc_diff - entity_diffs = true_diffs + loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) + if self.PRINT_LOSS: + print("loss", round(loss, 5)) - for false_entity in false_entities: - false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) - # print("false_mse", false_mse) - # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) - # print("false_exp", false_exp) - # print("false_diffs", false_diffs) - entity_mses.append(false_mse) - # if false_mse > true_mse: - # true_diffs = true_diffs - false_diffs ??? - # false_exp_sum += false_exp - - # prob = true_exp / false_exp_sum - # print("prob", prob) - - entity_mses = sorted(entity_mses) - # mse_sum = sum(entity_mses) - # entity_probs = [1 - x/mse_sum for x in entity_mses] - # print("entity_mses", entity_mses) - # print("entity_probs", entity_probs) - true_index = entity_mses.index(true_mse) - # print("true index", true_index) - # print("true prob", entity_probs[true_index]) - - # print("training loss", true_mse) - - # print() - - # TODO: proper backpropagation taking ranking of elements into account ? - # TODO backpropagation also for negative examples - - if doc_diffs is not None: - doc_diffs = doc_diffs / len(true_entity_list) - - true_entity_bp(entity_diffs, sgd=self.sgd_entity) - article_bp(doc_diffs, sgd=self.sgd_article) + doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) + print("doc_gradient", doc_gradient) + article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) + #except Exception as e: + #pass - # TODO delete ? 
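# Illustration only: the loss introduced in this commit, in plain NumPy with toy
# numbers. It is the negative log of the true candidate's share of the total
# probability mass, -log(p_true / sum(all_p)); when the probabilities already sum
# to one this reduces to the negative log-likelihood of the gold candidate.
import numpy as np

def candidate_loss(true_prob, all_probs):
    return -np.log(true_prob / np.sum(all_probs))

print(candidate_loss(0.7, [0.7, 0.2, 0.1]))   # ~0.357
print(candidate_loss(0.1, [0.7, 0.2, 0.1]))   # ~2.303, a wrong ranking costs more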
- def _simple_cnn_model(self, internal_dim): - nr_class = len(self.labels) - with Model.define_operators({">>": chain}): - model_entity = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # entity encoding - model_doc = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # doc encoding - output_layer = Softmax(nr_class, internal_dim*2) - model = (model_entity | model_doc) >> output_layer - # model.tok2vec = chain(tok2vec, flatten) - model.nO = nr_class - return model - - def predict(self, entity_doc, article_doc): - entity_encoding = self.entity_encoder(entity_doc) - doc_encoding = self.article_encoder(article_doc) - - print("entity_encodings", len(entity_encoding), entity_encoding) - print("doc_encodings", len(doc_encoding), doc_encoding) - mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) - print("mse", mse) - - return mse - - # TODO: expand to more than 2 vectors + # TODO: FIX def _calculate_consensus(self, vector1, vector2): if len(vector1) != len(vector2): raise ValueError("To calculate consenus, both vectors should be of equal length") @@ -268,17 +235,51 @@ class EL_Model(): avg = (vector2 + vector1) / 2 return avg - def _calculate_similarity(self, vector1, vector2): + def _calculate_probability(self, vector1, vector2, allvectors): + """ Make sure that vector2 is included in allvectors """ if len(vector1) != len(vector2): raise ValueError("To calculate similarity, both vectors should be of equal length") - diffs = (vector1 - vector2) - error_sum = (diffs ** 2).sum() - mean_square_error = error_sum / len(vector1) - return float(mean_square_error), diffs + vector1_t = vector1.transpose() + e = self._calculate_dot_exp(vector2, vector1_t) + e_sum = 0 + for v in allvectors: + e_sum += self._calculate_dot_exp(v, vector1_t) - def _get_labels(self): - return tuple(self.labels) + return float(e / e_sum) + + @staticmethod + def _calculate_loss(true_prob, all_probs): + """ all_probs should include true_prob ! 
""" + return -1 * np.log(true_prob / sum(all_probs)) + + @staticmethod + def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors): + gradient = np.zeros(len(doc_vector)) + for i in range(len(doc_vector)): + min_false = min(x[i] for x in false_vectors) + max_false = max(x[i] for x in false_vectors) + + if true_vector[i] > max_false: + if doc_vector[i] > 0: + gradient[i] = 0 + else: + gradient[i] = -loss + elif true_vector[i] < min_false: + if doc_vector[i] > 0: + gradient[i] = loss + if doc_vector[i] < 0: + gradient[i] = 0 + else: + target = 0 # non-distinctive vector positions should convert to 0 + gradient[i] = doc_vector[i] - target + + return gradient + + @staticmethod + def _calculate_dot_exp(vector1, vector2_transposed): + e = np.exp(vector1.dot(vector2_transposed)) + return e def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 43cc41392..bc75ac09a 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=5) print() # STEP 7: apply the EL algorithm on the dev dataset From 9ffe5437aee37c02db2d32a79bc4a2072448cce3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 15 May 2019 02:23:08 +0200 Subject: [PATCH 040/148] calculate gradient for entity encoding --- .../pipeline/wiki_entity_linking/train_el.py | 125 ++++++++++++------ .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 88 insertions(+), 39 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 06ac8d1d4..9f674d239 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -26,9 +26,10 @@ from spacy.tokens import Doc class EL_Model(): INPUT_DIM = 300 - OUTPUT_DIM = 5 # 96 - PRINT_LOSS = True + OUTPUT_DIM = 96 + PRINT_LOSS = False PRINT_F = True + EPS = 0.0000000005 labels = ["MATCH", "NOMATCH"] name = "entity_linker" @@ -71,12 +72,12 @@ class EL_Model(): instance_count = 0 for article_id, inst_cluster_set in train_instances.items(): - print("article", article_id) + # print("article", article_id) article_doc = train_doc[article_id] pos_ex_list = list() neg_exs_list = list() for inst_cluster in inst_cluster_set: - print("inst_cluster", inst_cluster) + # print("inst_cluster", inst_cluster) instance_count += 1 pos_ex_list.append(train_pos.get(inst_cluster)) neg_exs_list.append(train_neg.get(inst_cluster, [])) @@ -143,19 +144,19 @@ class EL_Model(): conv_depth = 1 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain, "**": clone}): - encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> ParametricAttention(in_width)\ - >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(in_width, in_width))) \ - >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> with_getitem(0, 
Affine(in_width, in_width)) \ - # >> ParametricAttention(in_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # >> flatten_add_lengths \ + # >> ParametricAttention(in_width)\ + # >> Pooling(mean_pool) \ + # >> Residual(zero_init(Maxout(in_width, in_width))) \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + encoder = SpacyVectors \ + >> flatten_add_lengths \ + >> with_getitem(0, Affine(in_width, in_width)) \ + >> ParametricAttention(in_width) \ + >> Pooling(sum_pool) \ + >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) # >> logistic @@ -178,20 +179,16 @@ class EL_Model(): return sgd def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): - + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + doc_encoding = doc_encoding[0] + # print("doc", doc_encoding) for i, true_entity in enumerate(true_entity_list): - for cnt in range(10): - #try: + try: false_vectors = list() false_entities = false_entities_list[i] if len(false_entities) > 0: # TODO: batch per doc - doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) - doc_encoding = doc_encoding[0] - print() - print(cnt) - print("doc", doc_encoding) for false_entity in false_entities: # TODO: one call only to begin_update ? @@ -201,6 +198,7 @@ class EL_Model(): true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) true_entity_encoding = true_entity_encoding[0] + # true_gradient = self._calculate_true_gradient(doc_encoding, true_entity_encoding) all_vectors = [true_entity_encoding] all_vectors.extend(false_vectors) @@ -208,29 +206,37 @@ class EL_Model(): # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) - print("true", true_prob, true_entity_encoding) + # print("true", true_prob, true_entity_encoding) + # print("true gradient", true_gradient) + # print() all_probs = [true_prob] for false_vector in false_vectors: false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) - print("false", false_prob, false_vector) + # print("false", false_prob, false_vector) + # print("false gradient", false_gradient) + # print() all_probs.append(false_prob) loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) if self.PRINT_LOSS: - print("loss", round(loss, 5)) + print(round(loss, 5)) - doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) - print("doc_gradient", doc_gradient) - article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) - #except Exception as e: - #pass + #doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) + entity_gradient = self._calculate_entity_gradient(doc_encoding, true_entity_encoding, false_vectors) + # print("entity_gradient", entity_gradient) + # print("doc_gradient", doc_gradient) + # article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) + true_entity_bp([entity_gradient.astype(np.float32)], sgd=self.sgd_entity) + #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) + except Exception as e: + pass # TODO: FIX def _calculate_consensus(self, vector1, 
vector2): if len(vector1) != len(vector2): - raise ValueError("To calculate consenus, both vectors should be of equal length") + raise ValueError("To calculate consensus, both vectors should be of equal length") avg = (vector2 + vector1) / 2 return avg @@ -246,12 +252,11 @@ class EL_Model(): for v in allvectors: e_sum += self._calculate_dot_exp(v, vector1_t) - return float(e / e_sum) + return float(e / (self.EPS + e_sum)) - @staticmethod - def _calculate_loss(true_prob, all_probs): + def _calculate_loss(self, true_prob, all_probs): """ all_probs should include true_prob ! """ - return -1 * np.log(true_prob / sum(all_probs)) + return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs))) @staticmethod def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors): @@ -276,9 +281,53 @@ class EL_Model(): return gradient + def _calculate_true_gradient(self, doc_vector, entity_vector): + # sum_entity_vector = sum(entity_vector) + # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] + gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] + return np.asarray(gradient) + + def _calculate_entity_gradient(self, doc_vector, true_vector, false_vectors): + entity_gradient = list() + prob_true = list() + false_prob_list = list() + for i in range(len(true_vector)): + doc_i = np.asarray([doc_vector[i]]) + true_i = np.asarray([true_vector[i]]) + falses_i = np.asarray([[fv[i]] for fv in false_vectors]) + all_i = [true_i] + all_i.extend(falses_i) + + prob_true_i = self._calculate_probability(doc_i, true_i, all_i) + prob_true.append(prob_true_i) + + false_list = list() + all_probs_i = [prob_true_i] + for false_vector in falses_i: + false_prob_i = self._calculate_probability(doc_i, false_vector, all_i) + all_probs_i.append(false_prob_i) + false_list.append(false_prob_i) + false_prob_list.append(false_list) + + sign_loss_i = 1 + if doc_vector[i] * true_vector[i] < 0: + sign_loss_i = -1 + + loss_i = sign_loss_i * self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) + entity_gradient.append(loss_i) + # print("prob_true", prob_true) + # print("false_prob_list", false_prob_list) + return np.asarray(entity_gradient) + + @staticmethod def _calculate_dot_exp(vector1, vector2_transposed): - e = np.exp(vector1.dot(vector2_transposed)) + dot_product = vector1.dot(vector2_transposed) + dot_product = min(50, dot_product) + # dot_product = max(-10000, dot_product) + # print("DOT", dot_product) + e = np.exp(dot_product) + # print("E", e) return e def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index bc75ac09a..cccc67650 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=5) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1500, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From b5470f3d753dd3bac3423121a44c0862a67b607c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 16 May 
2019 18:25:34 +0200 Subject: [PATCH 041/148] various tests, architectures and experiments --- .../pipeline/wiki_entity_linking/train_el.py | 472 ++++++++++++++---- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 363 insertions(+), 111 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 9f674d239..5cb027d0e 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -6,32 +6,40 @@ import datetime from os import listdir from random import shuffle import numpy as np +import random +from thinc.neural._classes.convolution import ExtractWindow +from thinc.neural._classes.feature_extracter import FeatureExtracter from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic -from thinc.api import chain, flatten_add_lengths, with_getitem, clone +from thinc.api import chain, concatenate, flatten_add_lengths, with_getitem, clone, with_flatten from thinc.neural.util import get_array_module from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu -from thinc.t2v import Pooling, sum_pool, mean_pool +from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual +from thinc.misc import LayerNorm as LN from spacy.tokens import Doc """ TODO: this code needs to be implemented in pipes.pyx""" -class EL_Model(): +class EL_Model: - INPUT_DIM = 300 - OUTPUT_DIM = 96 - PRINT_LOSS = False + PRINT_LOSS = True PRINT_F = True EPS = 0.0000000005 + CUTOFF = 0.5 + + INPUT_DIM = 300 + ENTITY_WIDTH = 64 + ARTICLE_WIDTH = 64 + HIDDEN_1_WIDTH = 256 + HIDDEN_2_WIDTH = 64 - labels = ["MATCH", "NOMATCH"] name = "entity_linker" def __init__(self, kb, nlp): @@ -39,58 +47,102 @@ class EL_Model(): self.nlp = nlp self.kb = kb - self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) - self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) + self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH) + + # self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) + # self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): + # raise errors instead of runtime warnings in case of int/float overflow + np.seterr(all='raise') + Doc.set_extension("entity_id", default=None) train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, entity_descr_output, False, trainlimit, - to_print) + to_print=False) dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, entity_descr_output, True, devlimit, - to_print) + to_print=False) + + # self.sgd_entity = self.begin_training(self.entity_encoder) + # self.sgd_article = self.begin_training(self.article_encoder) + self._begin_training() + + if self.PRINT_F: + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) + _, _, f_random_train = self._test_dev(train_instances, train_pos, 
train_neg, train_doc, calc_random=True) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) + _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True) + + print("random F train", round(f_random_train, 1)) + print("random F dev", round(f_random_dev, 1)) + print() + print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) + print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) + print() + + instance_pos_count = 0 + instance_neg_count = 0 if to_print: print("Training on", len(train_instances.values()), "articles") print("Dev test on", len(dev_instances.values()), "articles") print() - self.sgd_entity = self.begin_training(self.entity_encoder) - self.sgd_article = self.begin_training(self.article_encoder) + # for article_id, inst_cluster_set in train_instances.items(): + # article_doc = train_doc[article_id] + # print("training on", article_id, inst_cluster_set) + # pos_ex_list = list() + # neg_exs_list = list() + # for inst_cluster in inst_cluster_set: + # instance_count += 1 + # pos_ex_list.append(train_pos.get(inst_cluster)) + # neg_exs_list.append(train_neg.get(inst_cluster, [])) - self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) - - losses = {} - - instance_count = 0 + #self.update(article_doc, pos_ex_list, neg_exs_list) + article_docs = list() + entities = list() + golds = list() for article_id, inst_cluster_set in train_instances.items(): - # print("article", article_id) - article_doc = train_doc[article_id] - pos_ex_list = list() - neg_exs_list = list() for inst_cluster in inst_cluster_set: - # print("inst_cluster", inst_cluster) - instance_count += 1 - pos_ex_list.append(train_pos.get(inst_cluster)) - neg_exs_list.append(train_neg.get(inst_cluster, [])) + article_docs.append(train_doc[article_id]) + entities.append(train_pos.get(inst_cluster)) + golds.append(float(1.0)) + instance_pos_count += 1 + for neg_entity in train_neg.get(inst_cluster, []): + article_docs.append(train_doc[article_id]) + entities.append(neg_entity) + golds.append(float(0.0)) + instance_neg_count += 1 - self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses) - p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + for x in range(10): + print("Updating", x) + self.update(article_docs=article_docs, entities=entities, golds=golds) + + # eval again if self.PRINT_F: - print(round(fscore, 1)) + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) + + print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) + print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) + print() if to_print: - print("Trained on", instance_count, "instance clusters") + print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): + def _test_dev_depr(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): predictions = list() golds = list() @@ -113,23 +165,65 @@ class EL_Model(): 
examples.append(pos_ex) shuffle(examples) - best_entity, highest_prob = self._predict(examples, article_doc) + best_entity, highest_prob = self._predict(examples, article_doc, avg) + if calc_random: + best_entity, highest_prob = self._predict_random(examples) predictions.append(ex_to_id[best_entity]) golds.append(ex_to_id[pos_ex]) # TODO: use lowest_mse and combine with prior probability - p, r, F = run_el.evaluate(predictions, golds, to_print=False) - return p, r, F + p, r, f = run_el.evaluate(predictions, golds, to_print=False) + return p, r, f - def _predict(self, entities, article_doc): - doc_encoding = self.article_encoder([article_doc]) + def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): + predictions = list() + golds = list() + + for article_id, inst_cluster_set in dev_instances.items(): + for inst_cluster in inst_cluster_set: + pos_ex = dev_pos.get(inst_cluster) + neg_exs = dev_neg.get(inst_cluster, []) + + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = dev_doc[article] + + if calc_random: + prediction = self._predict_random(entity=pos_ex) + else: + prediction = self._predict(article_doc=article_doc, entity=pos_ex, avg=avg) + predictions.append(prediction) + golds.append(float(1.0)) + + for neg_ex in neg_exs: + if calc_random: + prediction = self._predict_random(entity=neg_ex) + else: + prediction = self._predict(article_doc=article_doc, entity=neg_ex, avg=avg) + predictions.append(prediction) + golds.append(float(0.0)) + + # TODO: use lowest_mse and combine with prior probability + p, r, f = run_el.evaluate(predictions, golds, to_print=False) + return p, r, f + + def _predict_depr(self, entities, article_doc, avg=False): + if avg: + with self.article_encoder.use_params(self.sgd_article.averages): + doc_encoding = self.article_encoder([article_doc]) + else: + doc_encoding = self.article_encoder([article_doc]) highest_prob = None best_entity = None entity_to_vector = dict() for entity in entities: - entity_to_vector[entity] = self.entity_encoder([entity]) + if avg: + with self.entity_encoder.use_params(self.sgd_entity.averages): + entity_to_vector[entity] = self.entity_encoder([entity]) + else: + entity_to_vector[entity] = self.entity_encoder([entity]) for entity in entities: entity_encoding = entity_to_vector[entity] @@ -140,7 +234,97 @@ class EL_Model(): return best_entity, highest_prob - def _simple_encoder(self, in_width, out_width): + def _predict(self, article_doc, entity, avg=False, apply_threshold=True): + if avg: + with self.sgd.use_params(self.model.averages): + doc_encoding = self.article_encoder([article_doc]) + entity_encoding = self.entity_encoder([entity]) + return self.model(np.append(entity_encoding, doc_encoding)) # TODO list + + doc_encoding = self.article_encoder([article_doc])[0] + entity_encoding = self.entity_encoder([entity])[0] + concat_encoding = list(entity_encoding) + list(doc_encoding) + np_array = np.asarray([concat_encoding]) + prediction = self.model(np_array) + if not apply_threshold: + return float(prediction) + if prediction > self.CUTOFF: + return float(1.0) + return float(0.0) + + def _predict_random_depr(self, entities): + highest_prob = 1 + best_entity = random.choice(entities) + return best_entity, highest_prob + + def _predict_random(self, entity, apply_threshold=True): + r = random.uniform(0, 1) + if not apply_threshold: + return r + if r > self.CUTOFF: + return float(1.0) + return float(0.0) + + def _build_cnn(self, hidden_entity_width, 
hidden_article_width): + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) # entity encoding + self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) # doc encoding + + hidden_input_with = hidden_entity_width + hidden_article_width + hidden_output_with = self.HIDDEN_1_WIDTH + + convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3)))) + + # self.entity_encoder | self.article_encoder \ + # self.model = with_flatten(LN(Maxout(hidden_with, hidden_with)) >> convolution_2 ** 2, pad=2) \ + # >> flatten_add_lengths \ + # >> ParametricAttention(hidden_with) \ + # >> Pooling(sum_pool) \ + # >> Softmax(nr_class, nr_class) + + self.model = Affine(hidden_output_with, hidden_input_with) \ + >> LN(Maxout(hidden_output_with, hidden_output_with)) \ + >> convolution_2 \ + >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \ + >> Affine(1, self.HIDDEN_2_WIDTH) \ + >> logistic + # >> with_flatten(LN(Maxout(hidden_output_with, hidden_output_with)) >> convolution_2 ** 2, pad=2) + + # >> convolution_2 \ + + # >> flatten_add_lengths + # >> ParametricAttention(hidden_output_with) \ + # >> Pooling(max_pool) \ + # >> Softmax(nr_class, nr_class) + + # self.model.nO = nr_class + + @staticmethod + def _encoder(in_width, hidden_width): + with Model.define_operators({">>": chain}): + encoder = SpacyVectors \ + >> flatten_add_lengths \ + >> ParametricAttention(in_width)\ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + + return encoder + + def begin_training_depr(self, model): + # TODO ? link_vectors_to_models(self.vocab) depr? + sgd = create_default_optimizer(model.ops) + return sgd + + def _begin_training(self): + # self.sgd_entity = self.begin_training(self.entity_encoder) + # self.sgd_article = self.begin_training(self.article_encoder) + self.sgd = create_default_optimizer(self.model.ops) + + # TODO: deprecated ? 
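# Illustration only: an assumed-shape NumPy sketch of the binary set-up above. The
# entity encoding and the article encoding are concatenated, pushed through a model
# ending in a logistic unit, and the score is thresholded at CUTOFF to a 1.0/0.0
# match decision (the real model is the Thinc pipeline built in _build_cnn).
import numpy as np

CUTOFF = 0.5

def predict_match(entity_vec, doc_vec, weights, bias=0.0):
    concat = np.concatenate([entity_vec, doc_vec])
    score = 1.0 / (1.0 + np.exp(-(concat @ weights + bias)))  # logistic output
    return float(score > CUTOFF)

entity_vec = np.random.rand(64).astype("float32")   # ENTITY_WIDTH
doc_vec = np.random.rand(64).astype("float32")      # ARTICLE_WIDTH in this commit
weights = (np.random.rand(128).astype("float32") - 0.5) * 0.1
print(predict_match(entity_vec, doc_vec, weights))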
+ def _simple_encoder_depr(self, in_width, out_width): + hidden_with = 128 + conv_depth = 1 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain, "**": clone}): @@ -150,21 +334,56 @@ class EL_Model(): # >> Pooling(mean_pool) \ # >> Residual(zero_init(Maxout(in_width, in_width))) \ # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> with_getitem(0, Affine(in_width, in_width)) \ - >> ParametricAttention(in_width) \ - >> Pooling(sum_pool) \ - >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> with_getitem(0, Affine(in_width, in_width)) \ + # >> ParametricAttention(in_width) \ + # >> Pooling(sum_pool) \ + # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> ParametricAttention(in_width)\ + # >> Pooling(sum_pool) \ + # >> Residual(zero_init(Maxout(in_width, in_width))) \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) # >> logistic - # convolution = Residual( - # ExtractWindow(nW=1) - # >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) - # ) + #convolution = Residual(ExtractWindow(nW=1) + # >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)) + #) + #encoder = SpacyVectors >> with_flatten( + # embed >> convolution ** conv_depth, pad=conv_depth + #) + + # static_vectors = SpacyVectors >> with_flatten( + # Affine(in_width, in_width) + #) + + convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3)))) + + encoder = SpacyVectors >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution_2 ** 2, pad = 2) \ + >> flatten_add_lengths \ + >> ParametricAttention(hidden_with) \ + >> Pooling(sum_pool) \ + >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ + >> zero_init(Affine(out_width, hidden_with, drop_factor=0.0)) \ + >> logistic + + # convolution = Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width*3)) + + # encoder = static_vectors # >> with_flatten( + # ReLu(in_width, in_width) + # >> convolution ** conv_depth, pad=conv_depth) \ + # >> Affine(out_width, in_width, drop_factor=0.0) + + # encoder = SpacyVectors >> with_flatten( + # LN(Maxout(in_width, in_width)) + # >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) ** conv_depth, + # pad=conv_depth, + #) >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3)) @@ -173,75 +392,91 @@ class EL_Model(): return encoder - def begin_training(self, model): - # TODO ? link_vectors_to_models(self.vocab) - sgd = create_default_optimizer(model.ops) - return sgd - - def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): + def update_depr(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) doc_encoding = doc_encoding[0] + # print() # print("doc", doc_encoding) for i, true_entity in enumerate(true_entity_list): try: - false_vectors = list() false_entities = false_entities_list[i] if len(false_entities) > 0: # TODO: batch per doc - for false_entity in false_entities: - # TODO: one call only to begin_update ? 
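# Illustration only: a NumPy sketch, with assumed dimensions, of what the encoders
# above boil down to. Token vectors are scored against a learned query (the
# ParametricAttention step), the scores are softmax-normalised, and the weighted sum
# gives a fixed-width encoding that the Affine layers then project down.
import numpy as np

def attention_pool(token_vectors, query):
    scores = token_vectors @ query                # one score per token
    scores = np.exp(scores - scores.max())
    alphas = scores / scores.sum()                # attention weights over tokens
    return alphas @ token_vectors                 # fixed-width pooled encoding

tokens = np.random.rand(15, 300).astype("float32")   # 15 tokens, INPUT_DIM = 300
query = np.random.rand(300).astype("float32")
pooled = attention_pool(tokens, query)
assert pooled.shape == (300,)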
- false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - false_entity_encoding = false_entity_encoding[0] - false_vectors.append(false_entity_encoding) + all_entities = [true_entity] + all_entities.extend(false_entities) - true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - true_entity_encoding = true_entity_encoding[0] - # true_gradient = self._calculate_true_gradient(doc_encoding, true_entity_encoding) + entity_encodings, entity_bp = self.entity_encoder.begin_update(all_entities, drop=drop) + true_entity_encoding = entity_encodings[0] + false_entity_encodings = entity_encodings[1:] all_vectors = [true_entity_encoding] - all_vectors.extend(false_vectors) + all_vectors.extend(false_entity_encodings) # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) # print("true", true_prob, true_entity_encoding) - # print("true gradient", true_gradient) - # print() all_probs = [true_prob] - for false_vector in false_vectors: + for false_vector in false_entity_encodings: false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) # print("false", false_prob, false_vector) - # print("false gradient", false_gradient) - # print() all_probs.append(false_prob) loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) if self.PRINT_LOSS: - print(round(loss, 5)) + print("loss train", round(loss, 5)) - #doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) - entity_gradient = self._calculate_entity_gradient(doc_encoding, true_entity_encoding, false_vectors) - # print("entity_gradient", entity_gradient) + # for false_vector in false_vectors: + # false_gradient = -1 * self._calculate_entity_gradient(loss, doc_encoding, false_vector, false_vectors) + # print("false gradient", false_gradient) + + # doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) + true_gradient, doc_gradient = self._calculate_entity_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) + # print("true_gradient", true_gradient) # print("doc_gradient", doc_gradient) - # article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) - true_entity_bp([entity_gradient.astype(np.float32)], sgd=self.sgd_entity) + article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) + entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) except Exception as e: pass + def update(self, article_docs, entities, golds, drop=0.): + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) + concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - # TODO: FIX - def _calculate_consensus(self, vector1, vector2): - if len(vector1) != len(vector2): - raise ValueError("To calculate consensus, both vectors should be of equal length") + predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=drop) - avg = (vector2 + vector1) / 2 - return avg + predictions = self.model.ops.flatten(predictions) + golds = self.model.ops.asarray(golds) - def _calculate_probability(self, vector1, vector2, allvectors): + # print("predictions", 
predictions) + # print("golds", golds) + + d_scores = (predictions - golds) # / predictions.shape[0] + # print("d_scores (1)", d_scores) + + loss = (d_scores ** 2).sum() + + if self.PRINT_LOSS: + print("loss train", round(loss, 5)) + + d_scores = d_scores.reshape((-1, 1)) + d_scores = d_scores.astype(np.float32) + # print("d_scores (2)", d_scores) + + model_gradient = bp_model(d_scores, sgd=self.sgd) + + doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] + entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] + + bp_doc(doc_gradient) + bp_encoding(entity_gradient) + + def _calculate_probability_depr(self, vector1, vector2, allvectors): """ Make sure that vector2 is included in allvectors """ if len(vector1) != len(vector2): raise ValueError("To calculate similarity, both vectors should be of equal length") @@ -254,12 +489,12 @@ class EL_Model(): return float(e / (self.EPS + e_sum)) - def _calculate_loss(self, true_prob, all_probs): + def _calculate_loss_depr(self, true_prob, all_probs): """ all_probs should include true_prob ! """ return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs))) @staticmethod - def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors): + def _calculate_doc_gradient_depr(loss, doc_vector, true_vector, false_vectors): gradient = np.zeros(len(doc_vector)) for i in range(len(doc_vector)): min_false = min(x[i] for x in false_vectors) @@ -276,21 +511,25 @@ class EL_Model(): if doc_vector[i] < 0: gradient[i] = 0 else: - target = 0 # non-distinctive vector positions should convert to 0 - gradient[i] = doc_vector[i] - target + # non-distinctive vector positions should converge to 0 + gradient[i] = doc_vector[i] return gradient - def _calculate_true_gradient(self, doc_vector, entity_vector): + # TODO: delete ? try again ? 
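# Illustration only: assumed-shape NumPy sketch of the batched update above. The
# output gradient is predictions - golds (a squared-error loss), it is pushed back
# through the joint model, and the gradient that falls out is sliced into an entity
# part and an article part so each encoder can be updated separately. The slice
# boundary simply mirrors the concatenation order of the two encodings.
import numpy as np

ENTITY_WIDTH, ARTICLE_WIDTH = 64, 64   # widths assumed for this sketch

def output_gradient_and_loss(predictions, golds):
    d_scores = predictions - golds
    loss = float((d_scores ** 2).sum())
    return d_scores.reshape((-1, 1)).astype("float32"), loss

def split_gradient(model_gradient):
    # concatenation order was [entity_encoding, article_encoding]
    return model_gradient[:, :ENTITY_WIDTH], model_gradient[:, ENTITY_WIDTH:]

preds = np.random.rand(8).astype("float32")
golds = np.array([1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0], dtype="float32")
d_scores, loss = output_gradient_and_loss(preds, golds)
entity_grad, doc_grad = split_gradient(np.random.rand(8, ENTITY_WIDTH + ARTICLE_WIDTH))
assert entity_grad.shape == (8, ENTITY_WIDTH) and doc_grad.shape == (8, ARTICLE_WIDTH)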
+ def depr__calculate_true_gradient(self, doc_vector, entity_vector): # sum_entity_vector = sum(entity_vector) # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] return np.asarray(gradient) - def _calculate_entity_gradient(self, doc_vector, true_vector, false_vectors): - entity_gradient = list() - prob_true = list() - false_prob_list = list() + def _calculate_losses_vector_depr(self, doc_vector, true_vector, false_vectors): + # prob_true = list() + # prob_false_dict = dict() + + true_losses = list() + # false_losses_dict = dict() + for i in range(len(true_vector)): doc_i = np.asarray([doc_vector[i]]) true_i = np.asarray([true_vector[i]]) @@ -299,32 +538,45 @@ class EL_Model(): all_i.extend(falses_i) prob_true_i = self._calculate_probability(doc_i, true_i, all_i) - prob_true.append(prob_true_i) + # prob_true.append(prob_true_i) - false_list = list() + # false_list = list() all_probs_i = [prob_true_i] - for false_vector in falses_i: - false_prob_i = self._calculate_probability(doc_i, false_vector, all_i) - all_probs_i.append(false_prob_i) - false_list.append(false_prob_i) - false_prob_list.append(false_list) + for false_i in falses_i: + prob_false_i = self._calculate_probability(doc_i, false_i, all_i) + all_probs_i.append(prob_false_i) + # false_list.append(prob_false_i) + # prob_false_dict[i] = false_list - sign_loss_i = 1 - if doc_vector[i] * true_vector[i] < 0: - sign_loss_i = -1 + true_loss_i = self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) + if doc_vector[i] > 0: + true_loss_i = -1 * true_loss_i + true_losses.append(true_loss_i) - loss_i = sign_loss_i * self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) - entity_gradient.append(loss_i) - # print("prob_true", prob_true) - # print("false_prob_list", false_prob_list) - return np.asarray(entity_gradient) + # false_loss_list = list() + # for prob_false_i in false_list: + # false_loss_i = self._calculate_loss(prob_false_i, all_probs_i).astype(np.float32) + # false_loss_list.append(false_loss_i) + # false_losses_dict[i] = false_loss_list + + return true_losses # , false_losses_dict + + def _calculate_entity_gradient_depr(self, loss, doc_vector, true_vector, false_vectors): + true_losses = self._calculate_losses_vector(doc_vector, true_vector, false_vectors) + + # renormalize the gradient so that the total sum of abs values does not exceed the actual loss + loss_i = sum([abs(x) for x in true_losses]) # sum of absolute values + entity_gradient = [(x/2) * (loss/loss_i) for x in true_losses] + doc_gradient = [(x/2) * (loss/loss_i) for x in true_losses] + + return np.asarray(entity_gradient), np.asarray(doc_gradient) @staticmethod - def _calculate_dot_exp(vector1, vector2_transposed): + def _calculate_dot_exp_depr(vector1, vector2_transposed): dot_product = vector1.dot(vector2_transposed) dot_product = min(50, dot_product) - # dot_product = max(-10000, dot_product) + dot_product = max(-10000, dot_product) # print("DOT", dot_product) e = np.exp(dot_product) # print("E", e) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index cccc67650..a5ebc99bb 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", 
datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1500, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=1) print() # STEP 7: apply the EL algorithm on the dev dataset From d51bffe63b9e92b3f6c2b4cfb09d2039e6e55a5f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 16 May 2019 18:36:15 +0200 Subject: [PATCH 042/148] clean up code --- .../pipeline/wiki_entity_linking/train_el.py | 346 +----------------- 1 file changed, 4 insertions(+), 342 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 5cb027d0e..369b0762c 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -4,11 +4,9 @@ from __future__ import unicode_literals import os import datetime from os import listdir -from random import shuffle import numpy as np import random from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural._classes.feature_extracter import FeatureExtracter from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -49,9 +47,6 @@ class EL_Model: self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH) - # self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) - # self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) - def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') @@ -69,16 +64,13 @@ class EL_Model: True, devlimit, to_print=False) - - # self.sgd_entity = self.begin_training(self.entity_encoder) - # self.sgd_article = self.begin_training(self.article_encoder) self._begin_training() if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) _, _, f_random_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, calc_random=True) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True) @@ -97,18 +89,6 @@ class EL_Model: print("Dev test on", len(dev_instances.values()), "articles") print() - # for article_id, inst_cluster_set in train_instances.items(): - # article_doc = train_doc[article_id] - # print("training on", article_id, inst_cluster_set) - # pos_ex_list = list() - # neg_exs_list = list() - # for inst_cluster in inst_cluster_set: - # instance_count += 1 - # pos_ex_list.append(train_pos.get(inst_cluster)) - # neg_exs_list.append(train_neg.get(inst_cluster, [])) - - #self.update(article_doc, pos_ex_list, neg_exs_list) - article_docs = list() entities = list() golds = list() @@ -130,9 
+110,9 @@ class EL_Model: # eval again if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) @@ -142,39 +122,6 @@ class EL_Model: if to_print: print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - def _test_dev_depr(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): - predictions = list() - golds = list() - - for article_id, inst_cluster_set in dev_instances.items(): - for inst_cluster in inst_cluster_set: - pos_ex = dev_pos.get(inst_cluster) - neg_exs = dev_neg.get(inst_cluster, []) - ex_to_id = dict() - - if pos_ex and neg_exs: - ex_to_id[pos_ex] = pos_ex._.entity_id - for neg_ex in neg_exs: - ex_to_id[neg_ex] = neg_ex._.entity_id - - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = dev_doc[article] - - examples = list(neg_exs) - examples.append(pos_ex) - shuffle(examples) - - best_entity, highest_prob = self._predict(examples, article_doc, avg) - if calc_random: - best_entity, highest_prob = self._predict_random(examples) - predictions.append(ex_to_id[best_entity]) - golds.append(ex_to_id[pos_ex]) - - # TODO: use lowest_mse and combine with prior probability - p, r, f = run_el.evaluate(predictions, golds, to_print=False) - return p, r, f - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): predictions = list() golds = list() @@ -207,33 +154,6 @@ class EL_Model: p, r, f = run_el.evaluate(predictions, golds, to_print=False) return p, r, f - def _predict_depr(self, entities, article_doc, avg=False): - if avg: - with self.article_encoder.use_params(self.sgd_article.averages): - doc_encoding = self.article_encoder([article_doc]) - else: - doc_encoding = self.article_encoder([article_doc]) - - highest_prob = None - best_entity = None - - entity_to_vector = dict() - for entity in entities: - if avg: - with self.entity_encoder.use_params(self.sgd_entity.averages): - entity_to_vector[entity] = self.entity_encoder([entity]) - else: - entity_to_vector[entity] = self.entity_encoder([entity]) - - for entity in entities: - entity_encoding = entity_to_vector[entity] - prob = self._calculate_probability(doc_encoding, entity_encoding, entity_to_vector.values()) - if not best_entity or prob > highest_prob: - highest_prob = prob - best_entity = entity - - return best_entity, highest_prob - def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: with self.sgd.use_params(self.model.averages): @@ -252,11 +172,6 @@ class EL_Model: return float(1.0) return float(0.0) - def _predict_random_depr(self, entities): - highest_prob = 1 - best_entity = random.choice(entities) - return best_entity, highest_prob - def _predict_random(self, entity, apply_threshold=True): r = random.uniform(0, 1) if not apply_threshold: @@ -275,29 +190,12 @@ class EL_Model: convolution_2 
= Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3)))) - # self.entity_encoder | self.article_encoder \ - # self.model = with_flatten(LN(Maxout(hidden_with, hidden_with)) >> convolution_2 ** 2, pad=2) \ - # >> flatten_add_lengths \ - # >> ParametricAttention(hidden_with) \ - # >> Pooling(sum_pool) \ - # >> Softmax(nr_class, nr_class) - self.model = Affine(hidden_output_with, hidden_input_with) \ >> LN(Maxout(hidden_output_with, hidden_output_with)) \ >> convolution_2 \ >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \ >> Affine(1, self.HIDDEN_2_WIDTH) \ >> logistic - # >> with_flatten(LN(Maxout(hidden_output_with, hidden_output_with)) >> convolution_2 ** 2, pad=2) - - # >> convolution_2 \ - - # >> flatten_add_lengths - # >> ParametricAttention(hidden_output_with) \ - # >> Pooling(max_pool) \ - # >> Softmax(nr_class, nr_class) - - # self.model.nO = nr_class @staticmethod def _encoder(in_width, hidden_width): @@ -311,138 +209,9 @@ class EL_Model: return encoder - def begin_training_depr(self, model): - # TODO ? link_vectors_to_models(self.vocab) depr? - sgd = create_default_optimizer(model.ops) - return sgd - def _begin_training(self): - # self.sgd_entity = self.begin_training(self.entity_encoder) - # self.sgd_article = self.begin_training(self.article_encoder) self.sgd = create_default_optimizer(self.model.ops) - # TODO: deprecated ? - def _simple_encoder_depr(self, in_width, out_width): - hidden_with = 128 - - conv_depth = 1 - cnn_maxout_pieces = 3 - with Model.define_operators({">>": chain, "**": clone}): - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> ParametricAttention(in_width)\ - # >> Pooling(mean_pool) \ - # >> Residual(zero_init(Maxout(in_width, in_width))) \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> with_getitem(0, Affine(in_width, in_width)) \ - # >> ParametricAttention(in_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> ParametricAttention(in_width)\ - # >> Pooling(sum_pool) \ - # >> Residual(zero_init(Maxout(in_width, in_width))) \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - - # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) - # >> logistic - - #convolution = Residual(ExtractWindow(nW=1) - # >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)) - #) - #encoder = SpacyVectors >> with_flatten( - # embed >> convolution ** conv_depth, pad=conv_depth - #) - - # static_vectors = SpacyVectors >> with_flatten( - # Affine(in_width, in_width) - #) - - convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3)))) - - encoder = SpacyVectors >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution_2 ** 2, pad = 2) \ - >> flatten_add_lengths \ - >> ParametricAttention(hidden_with) \ - >> Pooling(sum_pool) \ - >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ - >> zero_init(Affine(out_width, hidden_with, drop_factor=0.0)) \ - >> logistic - - # convolution = Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width*3)) - - # encoder = static_vectors # >> with_flatten( - # ReLu(in_width, in_width) - # >> convolution ** conv_depth, pad=conv_depth) \ - # >> Affine(out_width, in_width, drop_factor=0.0) - - # encoder = SpacyVectors >> with_flatten( - # LN(Maxout(in_width, in_width)) - # >> 
Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) ** conv_depth, - # pad=conv_depth, - #) >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - - # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3)) - - # encoder = SpacyVectors >> flatten_add_lengths >> convolution ** conv_depth - # encoder = with_flatten(embed >> convolution ** conv_depth, pad=conv_depth) - - return encoder - - def update_depr(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): - doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) - doc_encoding = doc_encoding[0] - # print() - # print("doc", doc_encoding) - - for i, true_entity in enumerate(true_entity_list): - try: - false_entities = false_entities_list[i] - if len(false_entities) > 0: - # TODO: batch per doc - - all_entities = [true_entity] - all_entities.extend(false_entities) - - entity_encodings, entity_bp = self.entity_encoder.begin_update(all_entities, drop=drop) - true_entity_encoding = entity_encodings[0] - false_entity_encodings = entity_encodings[1:] - - all_vectors = [true_entity_encoding] - all_vectors.extend(false_entity_encodings) - - # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - - true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) - # print("true", true_prob, true_entity_encoding) - - all_probs = [true_prob] - for false_vector in false_entity_encodings: - false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) - # print("false", false_prob, false_vector) - all_probs.append(false_prob) - - loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) - if self.PRINT_LOSS: - print("loss train", round(loss, 5)) - - # for false_vector in false_vectors: - # false_gradient = -1 * self._calculate_entity_gradient(loss, doc_encoding, false_vector, false_vectors) - # print("false gradient", false_gradient) - - # doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) - true_gradient, doc_gradient = self._calculate_entity_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) - # print("true_gradient", true_gradient) - # print("doc_gradient", doc_gradient) - article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) - entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) - #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) - except Exception as e: - pass - def update(self, article_docs, entities, golds, drop=0.): doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) @@ -476,112 +245,6 @@ class EL_Model: bp_doc(doc_gradient) bp_encoding(entity_gradient) - def _calculate_probability_depr(self, vector1, vector2, allvectors): - """ Make sure that vector2 is included in allvectors """ - if len(vector1) != len(vector2): - raise ValueError("To calculate similarity, both vectors should be of equal length") - - vector1_t = vector1.transpose() - e = self._calculate_dot_exp(vector2, vector1_t) - e_sum = 0 - for v in allvectors: - e_sum += self._calculate_dot_exp(v, vector1_t) - - return float(e / (self.EPS + e_sum)) - - def _calculate_loss_depr(self, true_prob, all_probs): - """ all_probs should include true_prob ! 
""" - return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs))) - - @staticmethod - def _calculate_doc_gradient_depr(loss, doc_vector, true_vector, false_vectors): - gradient = np.zeros(len(doc_vector)) - for i in range(len(doc_vector)): - min_false = min(x[i] for x in false_vectors) - max_false = max(x[i] for x in false_vectors) - - if true_vector[i] > max_false: - if doc_vector[i] > 0: - gradient[i] = 0 - else: - gradient[i] = -loss - elif true_vector[i] < min_false: - if doc_vector[i] > 0: - gradient[i] = loss - if doc_vector[i] < 0: - gradient[i] = 0 - else: - # non-distinctive vector positions should converge to 0 - gradient[i] = doc_vector[i] - - return gradient - - # TODO: delete ? try again ? - def depr__calculate_true_gradient(self, doc_vector, entity_vector): - # sum_entity_vector = sum(entity_vector) - # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] - gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] - return np.asarray(gradient) - - def _calculate_losses_vector_depr(self, doc_vector, true_vector, false_vectors): - # prob_true = list() - # prob_false_dict = dict() - - true_losses = list() - # false_losses_dict = dict() - - for i in range(len(true_vector)): - doc_i = np.asarray([doc_vector[i]]) - true_i = np.asarray([true_vector[i]]) - falses_i = np.asarray([[fv[i]] for fv in false_vectors]) - all_i = [true_i] - all_i.extend(falses_i) - - prob_true_i = self._calculate_probability(doc_i, true_i, all_i) - # prob_true.append(prob_true_i) - - # false_list = list() - all_probs_i = [prob_true_i] - for false_i in falses_i: - prob_false_i = self._calculate_probability(doc_i, false_i, all_i) - all_probs_i.append(prob_false_i) - # false_list.append(prob_false_i) - # prob_false_dict[i] = false_list - - true_loss_i = self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) - if doc_vector[i] > 0: - true_loss_i = -1 * true_loss_i - true_losses.append(true_loss_i) - - # false_loss_list = list() - # for prob_false_i in false_list: - # false_loss_i = self._calculate_loss(prob_false_i, all_probs_i).astype(np.float32) - # false_loss_list.append(false_loss_i) - # false_losses_dict[i] = false_loss_list - - return true_losses # , false_losses_dict - - def _calculate_entity_gradient_depr(self, loss, doc_vector, true_vector, false_vectors): - true_losses = self._calculate_losses_vector(doc_vector, true_vector, false_vectors) - - # renormalize the gradient so that the total sum of abs values does not exceed the actual loss - loss_i = sum([abs(x) for x in true_losses]) # sum of absolute values - entity_gradient = [(x/2) * (loss/loss_i) for x in true_losses] - doc_gradient = [(x/2) * (loss/loss_i) for x in true_losses] - - return np.asarray(entity_gradient), np.asarray(doc_gradient) - - - @staticmethod - def _calculate_dot_exp_depr(vector1, vector2_transposed): - dot_product = vector1.dot(vector2_transposed) - dot_product = min(50, dot_product) - dot_product = max(-10000, dot_product) - # print("DOT", dot_product) - e = np.exp(dot_product) - # print("E", e) - return e - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -589,7 +252,6 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - instance_by_doc = dict() local_vectors = list() # TODO: local vectors doc_by_article = dict() From 400b19353de9768805b6a4bcc7bcd72ba57bd001 Mon Sep 17 
00:00:00 2001 From: svlandeg Date: Fri, 17 May 2019 01:51:18 +0200 Subject: [PATCH 043/148] simplify architecture and larger-scale test runs --- .../pipeline/wiki_entity_linking/run_el.py | 3 +- .../pipeline/wiki_entity_linking/train_el.py | 165 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 4 +- 3 files changed, 88 insertions(+), 84 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 66ab0385e..6ab7ea75f 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -81,7 +81,8 @@ def evaluate(predictions, golds, to_print=True): for pred, gold in zip(predictions, golds): is_correct = pred == gold if not pred: - fn += 1 + if not is_correct: # we don't care about tn + fn += 1 elif is_correct: tp += 1 else: diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 369b0762c..21bc03282 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -12,10 +12,9 @@ from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic -from thinc.api import chain, concatenate, flatten_add_lengths, with_getitem, clone, with_flatten -from thinc.neural.util import get_array_module -from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu -from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool +from thinc.api import chain, concatenate, flatten_add_lengths, clone +from thinc.v2v import Model, Maxout, Affine +from thinc.t2v import Pooling, mean_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN @@ -27,16 +26,15 @@ from spacy.tokens import Doc class EL_Model: - PRINT_LOSS = True + PRINT_LOSS = False PRINT_F = True EPS = 0.0000000005 CUTOFF = 0.5 INPUT_DIM = 300 ENTITY_WIDTH = 64 - ARTICLE_WIDTH = 64 - HIDDEN_1_WIDTH = 256 - HIDDEN_2_WIDTH = 64 + ARTICLE_WIDTH = 128 + HIDDEN_WIDTH = 64 name = "entity_linker" @@ -53,46 +51,44 @@ class EL_Model: Doc.set_extension("entity_id", default=None) - train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - to_print=False) + train_inst, train_pos, train_neg, train_doc = self._get_training_data(training_dir, + entity_descr_output, + False, + trainlimit, + to_print=False) - dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - to_print=False) + dev_inst, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, + entity_descr_output, + True, + devlimit, + to_print=False) self._begin_training() - if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) - _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) - _, _, f_random_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, calc_random=True) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) - _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) - _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True) - - print("random F train", round(f_random_train, 
1)) - print("random F dev", round(f_random_dev, 1)) - print() - print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) - print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) - print() + print() + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_random", calc_random=True) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_random", calc_random=True) + print() + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_pre", calc_random=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_pre", avg=False) instance_pos_count = 0 instance_neg_count = 0 if to_print: - print("Training on", len(train_instances.values()), "articles") - print("Dev test on", len(dev_instances.values()), "articles") print() + print("Training on", len(train_inst.values()), "articles") + print("Dev test on", len(dev_inst.values()), "articles") - article_docs = list() - entities = list() - golds = list() - for article_id, inst_cluster_set in train_instances.items(): + # TODO: proper batches. Currently 1 article at the time + article_count = 0 + for article_id, inst_cluster_set in train_inst.items(): + # if to_print: + # print() + # print(article_count, "Training on article", article_id) + article_count += 1 + article_docs = list() + entities = list() + golds = list() for inst_cluster in inst_cluster_set: article_docs.append(train_doc[article_id]) entities.append(train_pos.get(inst_cluster)) @@ -104,36 +100,31 @@ class EL_Model: golds.append(float(0.0)) instance_neg_count += 1 - for x in range(10): - print("Updating", x) self.update(article_docs=article_docs, entities=entities, golds=golds) - # eval again - if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) - _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) - _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) - - print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) - print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) - print() + # dev eval + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) if to_print: + print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): + print() + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", calc_random=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) + + def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): predictions = list() golds = list() - for article_id, inst_cluster_set in dev_instances.items(): + for article_id, inst_cluster_set in instances.items(): for inst_cluster in inst_cluster_set: - pos_ex = dev_pos.get(inst_cluster) - neg_exs = dev_neg.get(inst_cluster, []) + pos_ex = pos.get(inst_cluster) + neg_exs = neg.get(inst_cluster, []) article = inst_cluster.split(sep="_")[0] entity_id = inst_cluster.split(sep="_")[1] - article_doc = dev_doc[article] + article_doc = doc[article] if calc_random: prediction = self._predict_random(entity=pos_ex) @@ -150,9 +141,17 @@ class EL_Model: 
predictions.append(prediction) golds.append(float(0.0)) - # TODO: use lowest_mse and combine with prior probability + # TODO: combine with prior probability p, r, f = run_el.evaluate(predictions, golds, to_print=False) - return p, r, f + if self.PRINT_F: + # print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) + print("F", print_string, round(f, 1)) + + loss, d_scores = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) + if self.PRINT_LOSS: + print("loss", print_string, round(loss, 5)) + + return loss, p, r, f def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: @@ -182,20 +181,16 @@ class EL_Model: def _build_cnn(self, hidden_entity_width, hidden_article_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) # entity encoding - self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) # doc encoding + self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) + self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) - hidden_input_with = hidden_entity_width + hidden_article_width - hidden_output_with = self.HIDDEN_1_WIDTH + nr_i = hidden_entity_width + hidden_article_width + nr_o = self.HIDDEN_WIDTH - convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3)))) - - self.model = Affine(hidden_output_with, hidden_input_with) \ - >> LN(Maxout(hidden_output_with, hidden_output_with)) \ - >> convolution_2 \ - >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \ - >> Affine(1, self.HIDDEN_2_WIDTH) \ - >> logistic + self.model = Affine(nr_o, nr_i) \ + >> LN(Maxout(nr_o, nr_o)) \ + >> Affine(1, nr_o) \ + >> logistic @staticmethod def _encoder(in_width, hidden_width): @@ -204,38 +199,46 @@ class EL_Model: >> flatten_add_lengths \ >> ParametricAttention(in_width)\ >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3)))) \ >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + # TODO: ReLu instead of LN(Maxout) ? 
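        # A hedged sketch of the ReLu variant that the TODO above refers to (illustrative
        # only, not applied in this patch; it assumes `ReLu` is imported from thinc.v2v):
        # the maxout block over the 3-token window would become
        #     Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width * 3))
        # i.e. ExtractWindow triples the width and the ReLu layer maps it back to in_width,
        # so the Residual wrapper still sees matching input/output widths.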
+ return encoder def _begin_training(self): self.sgd = create_default_optimizer(self.model.ops) - def update(self, article_docs, entities, golds, drop=0.): + @staticmethod + def get_loss(predictions, golds): + d_scores = (predictions - golds) + + loss = (d_scores ** 2).sum() + return loss, d_scores + + def update(self, article_docs, entities, golds, drop=0., apply_threshold=True): doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=drop) - predictions = self.model.ops.flatten(predictions) golds = self.model.ops.asarray(golds) - # print("predictions", predictions) - # print("golds", golds) + loss, d_scores = self.get_loss(predictions, golds) - d_scores = (predictions - golds) # / predictions.shape[0] - # print("d_scores (1)", d_scores) + # if self.PRINT_LOSS: + # print("loss train", round(loss, 5)) - loss = (d_scores ** 2).sum() - - if self.PRINT_LOSS: - print("loss train", round(loss, 5)) + # if self.PRINT_F: + # predictions_f = [x for x in predictions] + # if apply_threshold: + # predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] + # p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) + # print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) d_scores = d_scores.reshape((-1, 1)) d_scores = d_scores.astype(np.float32) - # print("d_scores (2)", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a5ebc99bb..2e4ab3c2e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -108,10 +108,10 @@ if __name__ == "__main__": # STEP 6: apply the EL algorithm on the training dataset if run_training: - print("STEP 6: training ", datetime.datetime.now()) + print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=1) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=2000, devlimit=200) print() # STEP 7: apply the EL algorithm on the dev dataset From dd691d00530eed432d6cf60b39d99206e5830f69 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 17 May 2019 17:44:11 +0200 Subject: [PATCH 044/148] debugging --- .../pipeline/wiki_entity_linking/train_el.py | 140 ++++++++++++------ .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- spacy/pipeline/pipes.pyx | 2 +- 3 files changed, 98 insertions(+), 46 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 21bc03282..312e50cad 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -28,13 +28,16 @@ class EL_Model: PRINT_LOSS = False PRINT_F = True + PRINT_TRAIN = True EPS = 0.0000000005 CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 64 - ARTICLE_WIDTH = 128 - HIDDEN_WIDTH = 64 + ENTITY_WIDTH = 4 # 64 + ARTICLE_WIDTH = 8 # 128 + HIDDEN_WIDTH = 6 # 64 + + DROP = 0.00 name = "entity_linker" @@ -78,40 +81,63 @@ class EL_Model: print() 
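            # Illustrative note, not from the original patch: the per-article loop further
            # below builds one batch per article, pairing each mention's gold description
            # (label 1.0) with its candidate negative descriptions (label 0.0), e.g.
            #     entities = [gold_descr, neg_descr_1, neg_descr_2]
            #     golds    = [1.0, 0.0, 0.0]
            # (gold_descr / neg_descr_* are placeholder names for the description texts.)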
print("Training on", len(train_inst.values()), "articles") print("Dev test on", len(dev_inst.values()), "articles") + print() + print(" CUTOFF", self.CUTOFF) + print(" INPUT_DIM", self.INPUT_DIM) + print(" ENTITY_WIDTH", self.ENTITY_WIDTH) + print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) + print(" HIDDEN_WIDTH", self.ARTICLE_WIDTH) + print(" DROP", self.DROP) + print() # TODO: proper batches. Currently 1 article at the time article_count = 0 for article_id, inst_cluster_set in train_inst.items(): - # if to_print: - # print() - # print(article_count, "Training on article", article_id) - article_count += 1 - article_docs = list() - entities = list() - golds = list() - for inst_cluster in inst_cluster_set: - article_docs.append(train_doc[article_id]) - entities.append(train_pos.get(inst_cluster)) - golds.append(float(1.0)) - instance_pos_count += 1 - for neg_entity in train_neg.get(inst_cluster, []): - article_docs.append(train_doc[article_id]) - entities.append(neg_entity) - golds.append(float(0.0)) - instance_neg_count += 1 + try: + # if to_print: + # print() + # print(article_count, "Training on article", article_id) + article_count += 1 + article_docs = list() + entities = list() + golds = list() + for inst_cluster in inst_cluster_set: + if instance_pos_count < 2: # TODO remove + article_docs.append(train_doc[article_id]) + entities.append(train_pos.get(inst_cluster)) + golds.append(float(1.0)) + instance_pos_count += 1 + for neg_entity in train_neg.get(inst_cluster, []): + article_docs.append(train_doc[article_id]) + entities.append(neg_entity) + golds.append(float(0.0)) + instance_neg_count += 1 - self.update(article_docs=article_docs, entities=entities, golds=golds) + for k in range(5): + print() + print("update", k) + print() + # print("article docs", article_docs) + print("entities", entities) + print("golds", golds) + print() + self.update(article_docs=article_docs, entities=entities, golds=golds) - # dev eval - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) + # dev eval + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) + except ValueError as e: + print("Error in article id", article_id) if to_print: print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", calc_random=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", avg=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post_avg", avg=True) def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): predictions = list() @@ -155,16 +181,24 @@ class EL_Model: def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: - with self.sgd.use_params(self.model.averages): - doc_encoding = self.article_encoder([article_doc]) - entity_encoding = self.entity_encoder([entity]) - return self.model(np.append(entity_encoding, doc_encoding)) # TODO list + with self.article_encoder.use_params(self.sgd_article.averages) \ + and self.entity_encoder.use_params(self.sgd_article.averages): + doc_encoding = 
self.article_encoder([article_doc])[0] + entity_encoding = self.entity_encoder([entity])[0] + + else: + doc_encoding = self.article_encoder([article_doc])[0] + entity_encoding = self.entity_encoder([entity])[0] - doc_encoding = self.article_encoder([article_doc])[0] - entity_encoding = self.entity_encoder([entity])[0] concat_encoding = list(entity_encoding) + list(doc_encoding) np_array = np.asarray([concat_encoding]) - prediction = self.model(np_array) + + if avg: + with self.model.use_params(self.sgd.averages): + prediction = self.model(np_array) + else: + prediction = self.model(np_array) + if not apply_threshold: return float(prediction) if prediction > self.CUTOFF: @@ -199,14 +233,17 @@ class EL_Model: >> flatten_add_lengths \ >> ParametricAttention(in_width)\ >> Pooling(mean_pool) \ - >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3)))) \ + >> (ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3))) \ >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) # TODO: ReLu instead of LN(Maxout) ? + # TODO: more convolutions ? return encoder def _begin_training(self): + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) self.sgd = create_default_optimizer(self.model.ops) @staticmethod @@ -216,34 +253,49 @@ class EL_Model: loss = (d_scores ** 2).sum() return loss, d_scores - def update(self, article_docs, entities, golds, drop=0., apply_threshold=True): - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) + def update(self, article_docs, entities, golds, apply_threshold=True): + print("article_docs", len(article_docs)) + for a in article_docs: + print(a[0:10], a[-10:]) + doc_encoding, bp_doc = self.article_encoder.begin_update([a], drop=self.DROP) + print(doc_encoding) + + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) + entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=drop) + print("doc_encodings", len(doc_encodings), doc_encodings) + print("entity_encodings", len(entity_encodings), entity_encodings) + print("concat_encodings", len(concat_encodings), concat_encodings) + + predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) + print("predictions", predictions) predictions = self.model.ops.flatten(predictions) golds = self.model.ops.asarray(golds) loss, d_scores = self.get_loss(predictions, golds) - # if self.PRINT_LOSS: - # print("loss train", round(loss, 5)) + if self.PRINT_LOSS and self.PRINT_TRAIN: + print("loss train", round(loss, 5)) - # if self.PRINT_F: - # predictions_f = [x for x in predictions] - # if apply_threshold: - # predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] - # p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) - # print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) + if self.PRINT_F and self.PRINT_TRAIN: + predictions_f = [x for x in predictions] + if apply_threshold: + predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] + p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) + print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) d_scores = 
d_scores.reshape((-1, 1)) d_scores = d_scores.astype(np.float32) + print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) + print("model_gradient", model_gradient) doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] + print("doc_gradient", doc_gradient) entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] + print("entity_gradient", entity_gradient) bp_doc(doc_gradient) bp_encoding(entity_gradient) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 2e4ab3c2e..ced905ac5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=2000, devlimit=200) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=10) print() # STEP 7: apply the EL algorithm on the dev dataset diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7043c1647..69521c1b2 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -293,7 +293,7 @@ class Tensorizer(Pipe): docs (iterable): A batch of `Doc` objects. golds (iterable): A batch of `GoldParse` objects. - drop (float): The droput rate. + drop (float): The dropout rate. sgd (callable): An optimizer. RETURNS (dict): Results from the update. """ From 7edb2e171181f0f49fb4b1f54326fa9e2b97373b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 20 May 2019 11:58:48 +0200 Subject: [PATCH 045/148] fix convolution layer --- .../pipeline/wiki_entity_linking/train_el.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 312e50cad..2d7ede48d 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -12,9 +12,9 @@ from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic -from thinc.api import chain, concatenate, flatten_add_lengths, clone +from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten from thinc.v2v import Model, Maxout, Affine -from thinc.t2v import Pooling, mean_pool +from thinc.t2v import Pooling, mean_pool, sum_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN @@ -96,13 +96,13 @@ class EL_Model: try: # if to_print: # print() - # print(article_count, "Training on article", article_id) + print(article_count, "Training on article", article_id) article_count += 1 article_docs = list() entities = list() golds = list() for inst_cluster in inst_cluster_set: - if instance_pos_count < 2: # TODO remove + if instance_pos_count < 2: # TODO del article_docs.append(train_doc[article_id]) entities.append(train_pos.get(inst_cluster)) golds.append(float(1.0)) @@ -228,16 +228,23 @@ class EL_Model: @staticmethod def _encoder(in_width, hidden_width): + conv_depth = 1 + cnn_maxout_pieces = 3 + with Model.define_operators({">>": chain}): + convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, 
pieces=cnn_maxout_pieces)))) + encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> ParametricAttention(in_width)\ - >> Pooling(mean_pool) \ - >> (ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3))) \ - >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + >> with_flatten(LN(Maxout(in_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> flatten_add_lengths \ + >> ParametricAttention(in_width)\ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) # TODO: ReLu instead of LN(Maxout) ? # TODO: more convolutions ? + # sum_pool or mean_pool ? return encoder @@ -261,16 +268,17 @@ class EL_Model: print(doc_encoding) doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) - concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - print("doc_encodings", len(doc_encodings), doc_encodings) + + entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) print("entity_encodings", len(entity_encodings), entity_encodings) - print("concat_encodings", len(concat_encodings), concat_encodings) + + concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] + # print("concat_encodings", len(concat_encodings), concat_encodings) predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - print("predictions", predictions) predictions = self.model.ops.flatten(predictions) + print("predictions", predictions) golds = self.model.ops.asarray(golds) loss, d_scores = self.get_loss(predictions, golds) @@ -287,15 +295,15 @@ class EL_Model: d_scores = d_scores.reshape((-1, 1)) d_scores = d_scores.astype(np.float32) - print("d_scores", d_scores) + # print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) - print("model_gradient", model_gradient) + # print("model_gradient", model_gradient) doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] - print("doc_gradient", doc_gradient) + # print("doc_gradient", doc_gradient) entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] - print("entity_gradient", entity_gradient) + # print("entity_gradient", entity_gradient) bp_doc(doc_gradient) bp_encoding(entity_gradient) From 89e322a637243d261b84ce01ae6d5595b7e82dd6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 20 May 2019 17:20:39 +0200 Subject: [PATCH 046/148] small fixes --- .../pipeline/wiki_entity_linking/train_el.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 2d7ede48d..3a7cd6186 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -33,9 +33,9 @@ class EL_Model: CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 4 # 64 - ARTICLE_WIDTH = 8 # 128 - HIDDEN_WIDTH = 6 # 64 + ENTITY_WIDTH = 4 # 64 + ARTICLE_WIDTH = 8 # 128 + HIDDEN_WIDTH = 6 # 64 DROP = 0.00 @@ -71,7 +71,7 @@ class EL_Model: self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_random", calc_random=True) self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_random", calc_random=True) print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, 
print_string="train_pre", calc_random=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_pre", avg=False) self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_pre", avg=False) instance_pos_count = 0 @@ -113,7 +113,7 @@ class EL_Model: golds.append(float(0.0)) instance_neg_count += 1 - for k in range(5): + for k in range(10): print() print("update", k) print() @@ -182,7 +182,7 @@ class EL_Model: def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.entity_encoder.use_params(self.sgd_article.averages): + and self.entity_encoder.use_params(self.sgd_entity.averages): doc_encoding = self.article_encoder([article_doc])[0] entity_encoding = self.entity_encoder([entity])[0] @@ -228,7 +228,7 @@ class EL_Model: @staticmethod def _encoder(in_width, hidden_width): - conv_depth = 1 + conv_depth = 2 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain}): @@ -261,16 +261,10 @@ class EL_Model: return loss, d_scores def update(self, article_docs, entities, golds, apply_threshold=True): - print("article_docs", len(article_docs)) - for a in article_docs: - print(a[0:10], a[-10:]) - doc_encoding, bp_doc = self.article_encoder.begin_update([a], drop=self.DROP) - print(doc_encoding) - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) print("doc_encodings", len(doc_encodings), doc_encodings) - entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) + entity_encodings, bp_entity = self.entity_encoder.begin_update(entities, drop=self.DROP) print("entity_encodings", len(entity_encodings), entity_encodings) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] @@ -298,15 +292,19 @@ class EL_Model: # print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) - # print("model_gradient", model_gradient) + print("model_gradient", model_gradient) - doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] - # print("doc_gradient", doc_gradient) - entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] - # print("entity_gradient", entity_gradient) + doc_gradient = list() + entity_gradient = list() + for x in model_gradient: + doc_gradient.append(list(x[0:self.ARTICLE_WIDTH])) + entity_gradient.append(list(x[self.ARTICLE_WIDTH:])) + + print("doc_gradient", doc_gradient) + print("entity_gradient", entity_gradient) bp_doc(doc_gradient) - bp_encoding(entity_gradient) + bp_entity(entity_gradient) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) From 0a15ee4541b2b46db716990830eb0d67d71fa45a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 20 May 2019 23:54:55 +0200 Subject: [PATCH 047/148] fix in bp call --- .../pipeline/wiki_entity_linking/train_el.py | 82 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 3a7cd6186..e213f0955 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, from spacy._ml import SpacyVectors, create_default_optimizer, 
zero_init, logistic from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten -from thinc.v2v import Model, Maxout, Affine +from thinc.v2v import Model, Maxout, Affine, ReLu from thinc.t2v import Pooling, mean_pool, sum_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual @@ -28,16 +28,16 @@ class EL_Model: PRINT_LOSS = False PRINT_F = True - PRINT_TRAIN = True + PRINT_TRAIN = False EPS = 0.0000000005 CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 4 # 64 - ARTICLE_WIDTH = 8 # 128 - HIDDEN_WIDTH = 6 # 64 + ENTITY_WIDTH = 64 # 4 + ARTICLE_WIDTH = 128 # 8 + HIDDEN_WIDTH = 64 # 6 - DROP = 0.00 + DROP = 0.1 name = "entity_linker" @@ -91,41 +91,34 @@ class EL_Model: print() # TODO: proper batches. Currently 1 article at the time + # TODO shuffle data (currently positive is always followed by several negatives) article_count = 0 for article_id, inst_cluster_set in train_inst.items(): try: # if to_print: # print() - print(article_count, "Training on article", article_id) + # print(article_count, "Training on article", article_id) article_count += 1 article_docs = list() entities = list() golds = list() for inst_cluster in inst_cluster_set: - if instance_pos_count < 2: # TODO del + article_docs.append(train_doc[article_id]) + entities.append(train_pos.get(inst_cluster)) + golds.append(float(1.0)) + instance_pos_count += 1 + for neg_entity in train_neg.get(inst_cluster, []): article_docs.append(train_doc[article_id]) - entities.append(train_pos.get(inst_cluster)) - golds.append(float(1.0)) - instance_pos_count += 1 - for neg_entity in train_neg.get(inst_cluster, []): - article_docs.append(train_doc[article_id]) - entities.append(neg_entity) - golds.append(float(0.0)) - instance_neg_count += 1 + entities.append(neg_entity) + golds.append(float(0.0)) + instance_neg_count += 1 - for k in range(10): - print() - print("update", k) - print() - # print("article docs", article_docs) - print("entities", entities) - print("golds", golds) - print() - self.update(article_docs=article_docs, entities=entities, golds=golds) + self.update(article_docs=article_docs, entities=entities, golds=golds) - # dev eval - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) + # dev eval + # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) + print() except ValueError as e: print("Error in article id", article_id) @@ -133,11 +126,12 @@ class EL_Model: print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", avg=False) - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post_avg", avg=True) + if self.PRINT_TRAIN: + # print() + # self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", avg=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) + # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) + # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, 
print_string="dev_post_avg", avg=True) def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): predictions = list() @@ -170,8 +164,7 @@ class EL_Model: # TODO: combine with prior probability p, r, f = run_el.evaluate(predictions, golds, to_print=False) if self.PRINT_F: - # print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) - print("F", print_string, round(f, 1)) + print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) loss, d_scores = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) if self.PRINT_LOSS: @@ -242,8 +235,7 @@ class EL_Model: >> Residual(zero_init(Maxout(in_width, in_width))) \ >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) - # TODO: ReLu instead of LN(Maxout) ? - # TODO: more convolutions ? + # TODO: ReLu or LN(Maxout) ? # sum_pool or mean_pool ? return encoder @@ -262,17 +254,17 @@ class EL_Model: def update(self, article_docs, entities, golds, apply_threshold=True): doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - print("doc_encodings", len(doc_encodings), doc_encodings) + # print("doc_encodings", len(doc_encodings), doc_encodings) entity_encodings, bp_entity = self.entity_encoder.begin_update(entities, drop=self.DROP) - print("entity_encodings", len(entity_encodings), entity_encodings) + # print("entity_encodings", len(entity_encodings), entity_encodings) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] # print("concat_encodings", len(concat_encodings), concat_encodings) predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) predictions = self.model.ops.flatten(predictions) - print("predictions", predictions) + # print("predictions", predictions) golds = self.model.ops.asarray(golds) loss, d_scores = self.get_loss(predictions, golds) @@ -292,7 +284,7 @@ class EL_Model: # print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) - print("model_gradient", model_gradient) + # print("model_gradient", model_gradient) doc_gradient = list() entity_gradient = list() @@ -300,11 +292,11 @@ class EL_Model: doc_gradient.append(list(x[0:self.ARTICLE_WIDTH])) entity_gradient.append(list(x[self.ARTICLE_WIDTH:])) - print("doc_gradient", doc_gradient) - print("entity_gradient", entity_gradient) + # print("doc_gradient", doc_gradient) + # print("entity_gradient", entity_gradient) - bp_doc(doc_gradient) - bp_entity(entity_gradient) + bp_doc(doc_gradient, sgd=self.sgd_article) + bp_entity(entity_gradient, sgd=self.sgd_entity) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ced905ac5..6f021597f 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=10) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=200) print() # STEP 7: apply the EL algorithm on the dev dataset From 
2fa3fac8512c1ed102a64017123246ca156cfef5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 21 May 2019 13:43:59 +0200 Subject: [PATCH 048/148] fix concat bp and more efficient batch calls --- .../pipeline/wiki_entity_linking/train_el.py | 163 ++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 76 insertions(+), 89 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index e213f0955..2d218ed60 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -52,27 +52,25 @@ class EL_Model: # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') - Doc.set_extension("entity_id", default=None) + train_inst, train_pos, train_neg, train_texts = self._get_training_data(training_dir, + entity_descr_output, + False, + trainlimit, + to_print=False) - train_inst, train_pos, train_neg, train_doc = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - to_print=False) - - dev_inst, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - to_print=False) + dev_inst, dev_pos, dev_neg, dev_texts = self._get_training_data(training_dir, + entity_descr_output, + True, + devlimit, + to_print=False) self._begin_training() print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_random", calc_random=True) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_random", calc_random=True) + self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_random", calc_random=True) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_random", calc_random=True) print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_pre", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_pre", avg=False) + self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_pre", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_pre", avg=False) instance_pos_count = 0 instance_neg_count = 0 @@ -99,26 +97,22 @@ class EL_Model: # print() # print(article_count, "Training on article", article_id) article_count += 1 - article_docs = list() + article_text = train_texts[article_id] entities = list() golds = list() for inst_cluster in inst_cluster_set: - article_docs.append(train_doc[article_id]) entities.append(train_pos.get(inst_cluster)) golds.append(float(1.0)) instance_pos_count += 1 for neg_entity in train_neg.get(inst_cluster, []): - article_docs.append(train_doc[article_id]) entities.append(neg_entity) golds.append(float(0.0)) instance_neg_count += 1 - self.update(article_docs=article_docs, entities=entities, golds=golds) + self.update(article_text=article_text, entities=entities, golds=golds) # dev eval - # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) - print() + self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_inter_avg", avg=True) except ValueError as e: print("Error in article id", article_id) @@ -127,13 +121,9 @@ class EL_Model: print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") if self.PRINT_TRAIN: - # print() - # self._test_dev(train_inst, 
train_pos, train_neg, train_doc, print_string="train_post", avg=False) - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) - # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) - # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post_avg", avg=True) + self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_post_avg", avg=True) - def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): + def _test_dev(self, instances, pos, neg, texts_by_id, print_string, avg=False, calc_random=False): predictions = list() golds = list() @@ -144,22 +134,18 @@ class EL_Model: article = inst_cluster.split(sep="_")[0] entity_id = inst_cluster.split(sep="_")[1] - article_doc = doc[article] + article_doc = self.nlp(texts_by_id[article]) + entities = [self.nlp(pos_ex)] + golds.append(float(1.0)) + for neg_ex in neg_exs: + entities.append(self.nlp(neg_ex)) + golds.append(float(0.0)) if calc_random: - prediction = self._predict_random(entity=pos_ex) + preds = self._predict_random(entities=entities) else: - prediction = self._predict(article_doc=article_doc, entity=pos_ex, avg=avg) - predictions.append(prediction) - golds.append(float(1.0)) - - for neg_ex in neg_exs: - if calc_random: - prediction = self._predict_random(entity=neg_ex) - else: - prediction = self._predict(article_doc=article_doc, entity=neg_ex, avg=avg) - predictions.append(prediction) - golds.append(float(0.0)) + preds = self._predict(article_doc=article_doc, entities=entities, avg=avg) + predictions.extend(preds) # TODO: combine with prior probability p, r, f = run_el.evaluate(predictions, golds, to_print=False) @@ -172,39 +158,38 @@ class EL_Model: return loss, p, r, f - def _predict(self, article_doc, entity, avg=False, apply_threshold=True): + def _predict(self, article_doc, entities, avg=False, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and self.entity_encoder.use_params(self.sgd_entity.averages): doc_encoding = self.article_encoder([article_doc])[0] - entity_encoding = self.entity_encoder([entity])[0] + entity_encodings = self.entity_encoder(entities) else: doc_encoding = self.article_encoder([article_doc])[0] - entity_encoding = self.entity_encoder([entity])[0] + entity_encodings = self.entity_encoder(entities) - concat_encoding = list(entity_encoding) + list(doc_encoding) - np_array = np.asarray([concat_encoding]) + concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] + np_array_list = np.asarray(concat_encodings) if avg: - with self.model.use_params(self.sgd.averages): - prediction = self.model(np_array) + with self.model.use_params(self.sgd.averages): + predictions = self.model(np_array_list) else: - prediction = self.model(np_array) + predictions = self.model(np_array_list) - if not apply_threshold: - return float(prediction) - if prediction > self.CUTOFF: - return float(1.0) - return float(0.0) + predictions = self.model.ops.flatten(predictions) + predictions = [float(p) for p in predictions] + if apply_threshold: + predictions = [float(1.0) if p > self.CUTOFF else float(0.0) for p in predictions] - def _predict_random(self, entity, apply_threshold=True): - r = random.uniform(0, 1) + return predictions + + def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: - return r - if r > self.CUTOFF: - return float(1.0) - return float(0.0) + return 
[float(random.uniform(0,1)) for e in entities] + else: + return [float(1.0) if random.uniform(0,1) > self.CUTOFF else float(0.0) for e in entities] def _build_cnn(self, hidden_entity_width, hidden_article_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): @@ -252,20 +237,27 @@ class EL_Model: loss = (d_scores ** 2).sum() return loss, d_scores - def update(self, article_docs, entities, golds, apply_threshold=True): - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - # print("doc_encodings", len(doc_encodings), doc_encodings) + # TODO: multiple docs/articles + def update(self, article_text, entities, golds, apply_threshold=True): + article_doc = self.nlp(article_text) + doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) + doc_encoding = doc_encodings[0] - entity_encodings, bp_entity = self.entity_encoder.begin_update(entities, drop=self.DROP) + entity_docs = list(self.nlp.pipe(entities)) + # print("entity_docs", type(entity_docs)) + + entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=self.DROP) # print("entity_encodings", len(entity_encodings), entity_encodings) - concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] + concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] # print("concat_encodings", len(concat_encodings), concat_encodings) predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) predictions = self.model.ops.flatten(predictions) + # print("predictions", predictions) golds = self.model.ops.asarray(golds) + # print("golds", golds) loss, d_scores = self.get_loss(predictions, golds) @@ -275,7 +267,7 @@ class EL_Model: if self.PRINT_F and self.PRINT_TRAIN: predictions_f = [x for x in predictions] if apply_threshold: - predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] + predictions_f = [float(1.0) if x > self.CUTOFF else float(0.0) for x in predictions_f] p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) @@ -286,17 +278,17 @@ class EL_Model: model_gradient = bp_model(d_scores, sgd=self.sgd) # print("model_gradient", model_gradient) - doc_gradient = list() - entity_gradient = list() + # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) + doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] + entity_gradients = list() for x in model_gradient: - doc_gradient.append(list(x[0:self.ARTICLE_WIDTH])) - entity_gradient.append(list(x[self.ARTICLE_WIDTH:])) + entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) # print("doc_gradient", doc_gradient) - # print("entity_gradient", entity_gradient) + # print("entity_gradients", entity_gradients) - bp_doc(doc_gradient, sgd=self.sgd_article) - bp_entity(entity_gradient, sgd=self.sgd_entity) + bp_doc([doc_gradient], sgd=self.sgd_article) + bp_entity(entity_gradients, sgd=self.sgd_entity) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -305,9 +297,9 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - instance_by_doc = dict() + instance_by_article = dict() local_vectors = list() # TODO: local vectors - doc_by_article = dict() + text_by_article = dict() pos_entities = dict() neg_entities = dict() @@ -319,33 +311,28 @@ class 
EL_Model: if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") cnt += 1 - if article_id not in doc_by_article: + if article_id not in text_by_article: with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() - doc = self.nlp(text) - doc_by_article[article_id] = doc - instance_by_doc[article_id] = set() + text_by_article[article_id] = text + instance_by_article[article_id] = set() for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instance_by_doc[article_id].add(article_id + "_" + mention) - doc_descr = self.nlp(descr) - doc_descr._.entity_id = entity_pos - pos_entities[article_id + "_" + mention] = doc_descr + instance_by_article[article_id].add(article_id + "_" + mention) + pos_entities[article_id + "_" + mention] = descr for mention, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - doc_descr = self.nlp(descr) - doc_descr._.entity_id = entity_neg descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(doc_descr) + descr_list.append(descr) neg_entities[article_id + "_" + mention] = descr_list if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return instance_by_doc, pos_entities, neg_entities, doc_by_article + return instance_by_article, pos_entities, neg_entities, text_by_article diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 6f021597f..23c12bfe6 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=200) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10, devlimit=10) print() # STEP 7: apply the EL algorithm on the dev dataset From 7b13e3d56fb2af2ba6f2ebdd9e26e1aa8f540dd5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 21 May 2019 18:35:10 +0200 Subject: [PATCH 049/148] undersampling negatives --- .../pipeline/wiki_entity_linking/train_el.py | 20 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 2d218ed60..20a5e4428 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -56,20 +56,19 @@ class EL_Model: entity_descr_output, False, trainlimit, + balance=True, to_print=False) dev_inst, dev_pos, dev_neg, dev_texts = self._get_training_data(training_dir, entity_descr_output, True, devlimit, + balance=False, to_print=False) self._begin_training() print() - self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_random", calc_random=True) self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_random", calc_random=True) - print() - self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_pre", avg=False) self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, 
print_string="dev_pre", avg=False) instance_pos_count = 0 @@ -120,9 +119,6 @@ class EL_Model: print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - if self.PRINT_TRAIN: - self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_post_avg", avg=True) - def _test_dev(self, instances, pos, neg, texts_by_id, print_string, avg=False, calc_random=False): predictions = list() golds = list() @@ -290,7 +286,7 @@ class EL_Model: bp_doc([doc_gradient], sgd=self.sgd_article) bp_entity(entity_gradients, sgd=self.sgd_entity) - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): + def _get_training_data(self, training_dir, entity_descr_output, dev, limit, balance, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, @@ -324,12 +320,16 @@ class EL_Model: pos_entities[article_id + "_" + mention] = descr for mention, entity_negs in incorrect_entries[article_id].items(): + neg_count = 0 for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(descr) - neg_entities[article_id + "_" + mention] = descr_list + # if balance, keep only 1 negative instance for each positive instance + if neg_count < 1 or not balance: + descr_list = neg_entities.get(article_id + "_" + mention, []) + descr_list.append(descr) + neg_entities[article_id + "_" + mention] = descr_list + neg_count += 1 if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 23c12bfe6..0927fb394 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10, devlimit=10) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=500, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From eb08bdb11feef7bd8ffaa31a7d30dab37e97d1d3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 21 May 2019 23:42:46 +0200 Subject: [PATCH 050/148] hidden with for encoders --- .../pipeline/wiki_entity_linking/train_el.py | 44 +++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 20a5e4428..36fb9227a 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -33,9 +33,10 @@ class EL_Model: CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 64 # 4 - ARTICLE_WIDTH = 128 # 8 - HIDDEN_WIDTH = 64 # 6 + HIDDEN_1_WIDTH = 256 # 10 + HIDDEN_2_WIDTH = 32 # 6 + ENTITY_WIDTH = 64 # 4 + ARTICLE_WIDTH = 128 # 8 DROP = 0.1 @@ -46,7 +47,11 @@ class EL_Model: self.nlp = nlp self.kb = kb - self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH) + self._build_cnn(in_width=self.INPUT_DIM, + entity_width=self.ENTITY_WIDTH, + article_width=self.ARTICLE_WIDTH, + 
hidden_1_width=self.HIDDEN_1_WIDTH, + hidden_2_width=self.HIDDEN_2_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow @@ -81,9 +86,10 @@ class EL_Model: print() print(" CUTOFF", self.CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) + print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" ENTITY_WIDTH", self.ENTITY_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) - print(" HIDDEN_WIDTH", self.ARTICLE_WIDTH) + print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print() @@ -187,34 +193,34 @@ class EL_Model: else: return [float(1.0) if random.uniform(0,1) > self.CUTOFF else float(0.0) for e in entities] - def _build_cnn(self, hidden_entity_width, hidden_article_width): + def _build_cnn(self, in_width, entity_width, article_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) - self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) + self.entity_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=entity_width) + self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) - nr_i = hidden_entity_width + hidden_article_width - nr_o = self.HIDDEN_WIDTH + in_width = entity_width + article_width + out_width = hidden_2_width - self.model = Affine(nr_o, nr_i) \ - >> LN(Maxout(nr_o, nr_o)) \ - >> Affine(1, nr_o) \ + self.model = Affine(out_width, in_width) \ + >> LN(Maxout(out_width, out_width)) \ + >> Affine(1, out_width) \ >> logistic @staticmethod - def _encoder(in_width, hidden_width): + def _encoder(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain}): - convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) + convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) encoder = SpacyVectors \ - >> with_flatten(LN(Maxout(in_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ >> flatten_add_lengths \ - >> ParametricAttention(in_width)\ + >> ParametricAttention(hidden_with)\ >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(in_width, in_width))) \ - >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ + >> zero_init(Affine(end_width, hidden_with, drop_factor=0.0)) # TODO: ReLu or LN(Maxout) ? # sum_pool or mean_pool ? 
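Aside, not part of the patch series: a minimal NumPy sketch of the gradient bookkeeping these
commits converge on, namely the squared-error signal returned by get_loss() and the split of the
gradient over the concatenated [entity | article] encoding back into per-encoder pieces, using the
entity-first concatenation order noted above. The widths and the toy batch are assumptions for
illustration only.

import numpy as np

ENTITY_WIDTH, ARTICLE_WIDTH = 64, 128          # widths used in these commits

# Toy batch: one gold candidate and two negatives for a single mention.
predictions = np.asarray([0.9, 0.2, 0.4], dtype=np.float32)
golds = np.asarray([1.0, 0.0, 0.0], dtype=np.float32)

d_scores = predictions - golds                 # backprop signal, as in get_loss()
loss = (d_scores ** 2).sum()                   # scalar squared-error loss (a later commit uses the mean)

# One gradient row per instance over the concatenated encoding; because the concatenation
# is entity + article, the first ENTITY_WIDTH columns belong to the entity encoder and the
# remaining ARTICLE_WIDTH columns to the article encoder.
model_gradient = np.zeros((len(golds), ENTITY_WIDTH + ARTICLE_WIDTH), dtype=np.float32)
entity_gradients = model_gradient[:, :ENTITY_WIDTH]
doc_gradient = model_gradient[0, ENTITY_WIDTH:]    # the article encoding is shared within the batch

Splitting by width like this is what lets bp_entity and bp_doc update the entity and article
encoders independently with their own optimizers (sgd_entity / sgd_article).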
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 0927fb394..a3d6a69f9 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=500, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 1a16490d20185949d65831fc96064a4c1e1c97e8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 22 May 2019 12:46:40 +0200 Subject: [PATCH 051/148] update per entity --- .../pipeline/wiki_entity_linking/train_el.py | 91 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 45 insertions(+), 48 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 36fb9227a..a383a3687 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -154,7 +154,7 @@ class EL_Model: if self.PRINT_F: print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) - loss, d_scores = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) + loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) if self.PRINT_LOSS: print("loss", print_string, round(loss, 5)) @@ -235,62 +235,58 @@ class EL_Model: @staticmethod def get_loss(predictions, golds): d_scores = (predictions - golds) - - loss = (d_scores ** 2).sum() + loss = (d_scores ** 2).mean() return loss, d_scores # TODO: multiple docs/articles def update(self, article_text, entities, golds, apply_threshold=True): article_doc = self.nlp(article_text) - doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) - doc_encoding = doc_encodings[0] + # entity_docs = list(self.nlp.pipe(entities)) - entity_docs = list(self.nlp.pipe(entities)) - # print("entity_docs", type(entity_docs)) + for entity, gold in zip(entities, golds): + doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) + doc_encoding = doc_encodings[0] - entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=self.DROP) - # print("entity_encodings", len(entity_encodings), entity_encodings) + entity_doc = self.nlp(entity) + # print("entity_docs", type(entity_doc)) - concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] - # print("concat_encodings", len(concat_encodings), concat_encodings) + entity_encodings, bp_entity = self.entity_encoder.begin_update([entity_doc], drop=self.DROP) + entity_encoding = entity_encodings[0] + # print("entity_encoding", len(entity_encoding), entity_encoding) - predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - predictions = self.model.ops.flatten(predictions) + concat_encodings = [list(entity_encoding) + list(doc_encoding)] # for i in range(len(entities)) + # print("concat_encodings", len(concat_encodings), concat_encodings) - # print("predictions", predictions) - golds = self.model.ops.asarray(golds) - # 
print("golds", golds) + prediction, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) + # predictions = self.model.ops.flatten(predictions) - loss, d_scores = self.get_loss(predictions, golds) + # print("prediction", prediction) + # golds = self.model.ops.asarray(golds) + # print("gold", gold) - if self.PRINT_LOSS and self.PRINT_TRAIN: - print("loss train", round(loss, 5)) + loss, gradient = self.get_loss(prediction, gold) - if self.PRINT_F and self.PRINT_TRAIN: - predictions_f = [x for x in predictions] - if apply_threshold: - predictions_f = [float(1.0) if x > self.CUTOFF else float(0.0) for x in predictions_f] - p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) - print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) + if self.PRINT_LOSS and self.PRINT_TRAIN: + print("loss train", round(loss, 5)) - d_scores = d_scores.reshape((-1, 1)) - d_scores = d_scores.astype(np.float32) - # print("d_scores", d_scores) + gradient = float(gradient) + # print("gradient", gradient) + # print("loss", loss) - model_gradient = bp_model(d_scores, sgd=self.sgd) - # print("model_gradient", model_gradient) + model_gradient = bp_model(gradient, sgd=self.sgd) + # print("model_gradient", model_gradient) - # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) - doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] - entity_gradients = list() - for x in model_gradient: - entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) + # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) + doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] + entity_gradients = list() + for x in model_gradient: + entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) - # print("doc_gradient", doc_gradient) - # print("entity_gradients", entity_gradients) + # print("doc_gradient", doc_gradient) + # print("entity_gradients", entity_gradients) - bp_doc([doc_gradient], sgd=self.sgd_article) - bp_entity(entity_gradients, sgd=self.sgd_entity) + bp_doc([doc_gradient], sgd=self.sgd_article) + bp_entity(entity_gradients, sgd=self.sgd_entity) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, balance, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -326,16 +322,17 @@ class EL_Model: pos_entities[article_id + "_" + mention] = descr for mention, entity_negs in incorrect_entries[article_id].items(): - neg_count = 0 - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: + if not balance or pos_entities.get(article_id + "_" + mention): + neg_count = 0 + for entity_neg in entity_negs: # if balance, keep only 1 negative instance for each positive instance if neg_count < 1 or not balance: - descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(descr) - neg_entities[article_id + "_" + mention] = descr_list - neg_count += 1 + descr = id_to_descr.get(entity_neg) + if descr: + descr_list = neg_entities.get(article_id + "_" + mention, []) + descr_list.append(descr) + neg_entities[article_id + "_" + mention] = descr_list + neg_count += 1 if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a3d6a69f9..319b1e1c8 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: 
training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 97241a3ed78d7fa41aaea3de30843ca49b0ae6d0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 22 May 2019 23:40:10 +0200 Subject: [PATCH 052/148] upsampling and batch processing --- .../pipeline/wiki_entity_linking/run_el.py | 12 +- .../pipeline/wiki_entity_linking/train_el.py | 294 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 3 files changed, 157 insertions(+), 151 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 6ab7ea75f..273543306 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -78,8 +78,15 @@ def evaluate(predictions, golds, to_print=True): fp = 0 fn = 0 + corrects = 0 + incorrects = 0 + for pred, gold in zip(predictions, golds): is_correct = pred == gold + if is_correct: + corrects += 1 + else: + incorrects += 1 if not pred: if not is_correct: # we don't care about tn fn += 1 @@ -98,12 +105,15 @@ def evaluate(predictions, golds, to_print=True): recall = 100 * tp / (tp + fn + 0.0000001) fscore = 2 * recall * precision / (recall + precision + 0.0000001) + accuracy = corrects / (corrects + incorrects) + if to_print: print("precision", round(precision, 1), "%") print("recall", round(recall, 1), "%") print("Fscore", round(fscore, 1), "%") + print("Accuracy", round(accuracy, 1), "%") - return precision, recall, fscore + return precision, recall, fscore, accuracy def _prepare_pipeline(nlp, kb): diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index a383a3687..cd6e9de4d 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -6,6 +6,7 @@ import datetime from os import listdir import numpy as np import random +from random import shuffle from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -26,17 +27,17 @@ from spacy.tokens import Doc class EL_Model: - PRINT_LOSS = False - PRINT_F = True PRINT_TRAIN = False EPS = 0.0000000005 CUTOFF = 0.5 + BATCH_SIZE = 5 + INPUT_DIM = 300 - HIDDEN_1_WIDTH = 256 # 10 + HIDDEN_1_WIDTH = 32 # 10 HIDDEN_2_WIDTH = 32 # 6 - ENTITY_WIDTH = 64 # 4 - ARTICLE_WIDTH = 128 # 8 + DESC_WIDTH = 64 # 4 + ARTICLE_WIDTH = 64 # 8 DROP = 0.1 @@ -48,7 +49,7 @@ class EL_Model: self.kb = kb self._build_cnn(in_width=self.INPUT_DIM, - entity_width=self.ENTITY_WIDTH, + desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH, hidden_2_width=self.HIDDEN_2_WIDTH) @@ -57,121 +58,118 @@ class EL_Model: # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') - train_inst, train_pos, train_neg, train_texts = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - balance=True, - to_print=False) + train_ent, train_gold, train_desc, train_article, train_texts = self._get_training_data(training_dir, + entity_descr_output, + False, + trainlimit, + to_print=False) + + train_pos_entities = 
[k for k,v in train_gold.items() if v] + train_neg_entities = [k for k,v in train_gold.items() if not v] + + train_pos_count = len(train_pos_entities) + train_neg_count = len(train_neg_entities) + + # upsample positives to 50-50 distribution + while train_pos_count < train_neg_count: + train_ent.append(random.choice(train_pos_entities)) + train_pos_count += 1 + + # upsample negatives to 50-50 distribution + while train_neg_count < train_pos_count: + train_ent.append(random.choice(train_neg_entities)) + train_neg_count += 1 + + shuffle(train_ent) + + dev_ent, dev_gold, dev_desc, dev_article, dev_texts = self._get_training_data(training_dir, + entity_descr_output, + True, + devlimit, + to_print=False) + shuffle(dev_ent) + + dev_pos_count = len([g for g in dev_gold.values() if g]) + dev_neg_count = len([g for g in dev_gold.values() if not g]) - dev_inst, dev_pos, dev_neg, dev_texts = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - balance=False, - to_print=False) self._begin_training() print() - self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_random", calc_random=True) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_pre", avg=False) - - instance_pos_count = 0 - instance_neg_count = 0 + self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_random", calc_random=True) + print() + self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_pre", avg=True) if to_print: print() - print("Training on", len(train_inst.values()), "articles") - print("Dev test on", len(dev_inst.values()), "articles") + print("Training on", len(train_ent), "entities in", len(train_texts), "articles") + print("Training instances pos/neg", train_pos_count, train_neg_count) + print() + print("Dev test on", len(dev_ent), "entities in", len(dev_texts), "articles") + print("Dev instances pos/neg", dev_pos_count, dev_neg_count) print() print(" CUTOFF", self.CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) - print(" ENTITY_WIDTH", self.ENTITY_WIDTH) + print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print() - # TODO: proper batches. 
Currently 1 article at the time - # TODO shuffle data (currently positive is always followed by several negatives) - article_count = 0 - for article_id, inst_cluster_set in train_inst.items(): - try: - # if to_print: - # print() - # print(article_count, "Training on article", article_id) - article_count += 1 - article_text = train_texts[article_id] - entities = list() - golds = list() - for inst_cluster in inst_cluster_set: - entities.append(train_pos.get(inst_cluster)) - golds.append(float(1.0)) - instance_pos_count += 1 - for neg_entity in train_neg.get(inst_cluster, []): - entities.append(neg_entity) - golds.append(float(0.0)) - instance_neg_count += 1 + start = 0 + stop = min(self.BATCH_SIZE, len(train_ent)) + processed = 0 - self.update(article_text=article_text, entities=entities, golds=golds) + while start < len(train_ent): + next_batch = train_ent[start:stop] - # dev eval - self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_inter_avg", avg=True) - except ValueError as e: - print("Error in article id", article_id) + golds = [train_gold[e] for e in next_batch] + descs = [train_desc[e] for e in next_batch] + articles = [train_texts[train_article[e]] for e in next_batch] + + self.update(entities=next_batch, golds=golds, descs=descs, texts=articles) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_inter", avg=True) + + processed += len(next_batch) + + start = start + self.BATCH_SIZE + stop = min(stop + self.BATCH_SIZE, len(train_ent)) if to_print: print() - print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") + print("Trained on", processed, "entities in total") - def _test_dev(self, instances, pos, neg, texts_by_id, print_string, avg=False, calc_random=False): - predictions = list() - golds = list() + def _test_dev(self, entities, gold_by_entity, desc_by_entity, article_by_entity, texts_by_id, print_string, avg=True, calc_random=False): + golds = [gold_by_entity[e] for e in entities] - for article_id, inst_cluster_set in instances.items(): - for inst_cluster in inst_cluster_set: - pos_ex = pos.get(inst_cluster) - neg_exs = neg.get(inst_cluster, []) + if calc_random: + predictions = self._predict_random(entities=entities) - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = self.nlp(texts_by_id[article]) - entities = [self.nlp(pos_ex)] - golds.append(float(1.0)) - for neg_ex in neg_exs: - entities.append(self.nlp(neg_ex)) - golds.append(float(0.0)) - - if calc_random: - preds = self._predict_random(entities=entities) - else: - preds = self._predict(article_doc=article_doc, entities=entities, avg=avg) - predictions.extend(preds) + else: + desc_docs = self.nlp.pipe([desc_by_entity[e] for e in entities]) + article_docs = self.nlp.pipe([texts_by_id[article_by_entity[e]] for e in entities]) + predictions = self._predict(entities=entities, article_docs=article_docs, desc_docs=desc_docs, avg=avg) # TODO: combine with prior probability - p, r, f = run_el.evaluate(predictions, golds, to_print=False) - if self.PRINT_F: - print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) - + p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False) loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) - if self.PRINT_LOSS: - print("loss", print_string, round(loss, 5)) + + print("p/r/F/acc/loss", print_string, round(p, 1), round(r, 1), round(f, 1), round(acc, 2), round(loss, 5)) return loss, p, r, f - 
def _predict(self, article_doc, entities, avg=False, apply_threshold=True): + def _predict(self, entities, article_docs, desc_docs, avg=True, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.entity_encoder.use_params(self.sgd_entity.averages): - doc_encoding = self.article_encoder([article_doc])[0] - entity_encodings = self.entity_encoder(entities) + and self.desc_encoder.use_params(self.sgd_entity.averages): + doc_encodings = self.article_encoder(article_docs) + desc_encodings = self.desc_encoder(desc_docs) else: - doc_encoding = self.article_encoder([article_doc])[0] - entity_encodings = self.entity_encoder(entities) + doc_encodings = self.article_encoder(article_docs) + desc_encodings = self.desc_encoder(desc_docs) - concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] + concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] np_array_list = np.asarray(concat_encodings) if avg: @@ -189,16 +187,16 @@ class EL_Model: def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: - return [float(random.uniform(0,1)) for e in entities] + return [float(random.uniform(0, 1)) for e in entities] else: - return [float(1.0) if random.uniform(0,1) > self.CUTOFF else float(0.0) for e in entities] + return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for e in entities] - def _build_cnn(self, in_width, entity_width, article_width, hidden_1_width, hidden_2_width): + def _build_cnn(self, in_width, desc_width, article_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.entity_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=entity_width) + self.desc_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=desc_width) self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) - in_width = entity_width + article_width + in_width = desc_width + article_width out_width = hidden_2_width self.model = Affine(out_width, in_width) \ @@ -229,80 +227,78 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) + self.sgd_entity = create_default_optimizer(self.desc_encoder.ops) self.sgd = create_default_optimizer(self.model.ops) @staticmethod def get_loss(predictions, golds): d_scores = (predictions - golds) + gradient = d_scores.mean() loss = (d_scores ** 2).mean() - return loss, d_scores + return loss, gradient - # TODO: multiple docs/articles - def update(self, article_text, entities, golds, apply_threshold=True): - article_doc = self.nlp(article_text) - # entity_docs = list(self.nlp.pipe(entities)) + def update(self, entities, golds, descs, texts): + golds = self.model.ops.asarray(golds) - for entity, gold in zip(entities, golds): - doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) - doc_encoding = doc_encodings[0] + desc_docs = self.nlp.pipe(descs) + article_docs = self.nlp.pipe(texts) - entity_doc = self.nlp(entity) - # print("entity_docs", type(entity_doc)) + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - entity_encodings, bp_entity = self.entity_encoder.begin_update([entity_doc], drop=self.DROP) - entity_encoding = entity_encodings[0] - # 
print("entity_encoding", len(entity_encoding), entity_encoding) + desc_encodings, bp_entity = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - concat_encodings = [list(entity_encoding) + list(doc_encoding)] # for i in range(len(entities)) - # print("concat_encodings", len(concat_encodings), concat_encodings) + concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - prediction, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - # predictions = self.model.ops.flatten(predictions) + predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) + predictions = self.model.ops.flatten(predictions) - # print("prediction", prediction) - # golds = self.model.ops.asarray(golds) - # print("gold", gold) + # print("entities", entities) + # print("predictions", predictions) + # print("golds", golds) - loss, gradient = self.get_loss(prediction, gold) + loss, gradient = self.get_loss(predictions, golds) - if self.PRINT_LOSS and self.PRINT_TRAIN: - print("loss train", round(loss, 5)) + if self.PRINT_TRAIN: + print("loss train", round(loss, 5)) - gradient = float(gradient) - # print("gradient", gradient) - # print("loss", loss) + gradient = float(gradient) + # print("gradient", gradient) + # print("loss", loss) - model_gradient = bp_model(gradient, sgd=self.sgd) - # print("model_gradient", model_gradient) + model_gradient = bp_model(gradient, sgd=self.sgd) + # print("model_gradient", model_gradient) - # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) - doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] - entity_gradients = list() - for x in model_gradient: - entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) + # concat = desc + doc, but doc is the same within this function (TODO: multiple docs/articles) + doc_gradient = model_gradient[0][self.DESC_WIDTH:] + entity_gradients = list() + for x in model_gradient: + entity_gradients.append(list(x[0:self.DESC_WIDTH])) - # print("doc_gradient", doc_gradient) - # print("entity_gradients", entity_gradients) + # print("doc_gradient", doc_gradient) + # print("entity_gradients", entity_gradients) - bp_doc([doc_gradient], sgd=self.sgd_article) - bp_entity(entity_gradients, sgd=self.sgd_entity) + bp_doc([doc_gradient], sgd=self.sgd_article) + bp_entity(entity_gradients, sgd=self.sgd_entity) - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, balance, to_print): + def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) - instance_by_article = dict() local_vectors = list() # TODO: local vectors text_by_article = dict() - pos_entities = dict() - neg_entities = dict() + gold_by_entity = dict() + desc_by_entity = dict() + article_by_entity = dict() + entities = list() cnt = 0 - for f in listdir(training_dir): + next_entity_nr = 0 + files = listdir(training_dir) + shuffle(files) + for f in files: if not limit or cnt < limit: if dev == run_el.is_dev(f): article_id = f.replace(".txt", "") @@ -313,29 +309,29 @@ class EL_Model: with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() text_by_article[article_id] = text - instance_by_article[article_id] = set() for mention, entity_pos in 
correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instance_by_article[article_id].add(article_id + "_" + mention) - pos_entities[article_id + "_" + mention] = descr + entities.append(next_entity_nr) + gold_by_entity[next_entity_nr] = 1 + desc_by_entity[next_entity_nr] = descr + article_by_entity[next_entity_nr] = article_id + next_entity_nr += 1 for mention, entity_negs in incorrect_entries[article_id].items(): - if not balance or pos_entities.get(article_id + "_" + mention): - neg_count = 0 - for entity_neg in entity_negs: - # if balance, keep only 1 negative instance for each positive instance - if neg_count < 1 or not balance: - descr = id_to_descr.get(entity_neg) - if descr: - descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(descr) - neg_entities[article_id + "_" + mention] = descr_list - neg_count += 1 + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + entities.append(next_entity_nr) + gold_by_entity[next_entity_nr] = 0 + desc_by_entity[next_entity_nr] = descr + article_by_entity[next_entity_nr] = article_id + next_entity_nr += 1 if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return instance_by_article, pos_entities, neg_entities, text_by_article + return entities, gold_by_entity, desc_by_entity, article_by_entity, text_by_article + diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 319b1e1c8..715282642 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=400, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From 4392c01b7bfb22e435249128ac15c196c5b50bd1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 23 May 2019 15:37:05 +0200 Subject: [PATCH 053/148] obtain sentence for each mention --- .../pipeline/wiki_entity_linking/run_el.py | 9 +- .../pipeline/wiki_entity_linking/train_el.py | 144 +++++++++++++----- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 3 files changed, 112 insertions(+), 43 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 273543306..c0c219829 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -70,7 +70,7 @@ def is_dev(file_name): return file_name.endswith("3.txt") -def evaluate(predictions, golds, to_print=True): +def evaluate(predictions, golds, to_print=True, times_hundred=True): if len(predictions) != len(golds): raise ValueError("predictions and gold entities should have the same length") @@ -101,8 +101,11 @@ def evaluate(predictions, golds, to_print=True): print("fp", fp) print("fn", fn) - precision = 100 * tp / (tp + fp + 0.0000001) - recall = 100 * tp / (tp + fn + 0.0000001) + precision = tp / (tp + fp + 0.0000001) + recall = tp / (tp + fn + 0.0000001) + if times_hundred: + precision = precision*100 + recall = recall*100 fscore = 2 * recall * precision / (recall + precision + 0.0000001) accuracy 
= corrects / (corrects + incorrects) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index cd6e9de4d..d8082635a 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -20,6 +20,7 @@ from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN +from spacy.matcher import PhraseMatcher from spacy.tokens import Doc """ TODO: this code needs to be implemented in pipes.pyx""" @@ -27,13 +28,16 @@ from spacy.tokens import Doc class EL_Model: + PRINT_INSPECT = False PRINT_TRAIN = False EPS = 0.0000000005 CUTOFF = 0.5 BATCH_SIZE = 5 - INPUT_DIM = 300 + DOC_CUTOFF = 300 # number of characters from the doc context + INPUT_DIM = 300 # dimension of pre-trained vectors + HIDDEN_1_WIDTH = 32 # 10 HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 @@ -58,11 +62,20 @@ class EL_Model: # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') - train_ent, train_gold, train_desc, train_article, train_texts = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - to_print=False) + train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ + self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + + # inspect data + if self.PRINT_INSPECT: + for entity in train_ent: + print("entity", entity) + print("gold", train_gold[entity]) + print("desc", train_desc[entity]) + print("sentence ID", train_sent[entity]) + print("sentence text", train_sent_texts[train_sent[entity]]) + print("article ID", train_art[entity]) + print("article text", train_art_texts[train_art[entity]]) + print() train_pos_entities = [k for k,v in train_gold.items() if v] train_neg_entities = [k for k,v in train_gold.items() if not v] @@ -70,6 +83,10 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) + if to_print: + print() + print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) + # upsample positives to 50-50 distribution while train_pos_count < train_neg_count: train_ent.append(random.choice(train_pos_entities)) @@ -82,11 +99,8 @@ class EL_Model: shuffle(train_ent) - dev_ent, dev_gold, dev_desc, dev_article, dev_texts = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - to_print=False) + dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ + self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) shuffle(dev_ent) dev_pos_count = len([g for g in dev_gold.values() if g]) @@ -94,20 +108,16 @@ class EL_Model: self._begin_training() - print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_random", calc_random=True) - print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_pre", avg=True) - if to_print: print() - print("Training on", len(train_ent), "entities in", len(train_texts), "articles") - print("Training instances pos/neg", train_pos_count, train_neg_count) + print("Training on", len(train_ent), "entities in", len(train_art_texts), "articles") + print("Training instances pos/neg:", train_pos_count, train_neg_count) print() - print("Dev test on", len(dev_ent), "entities in", len(dev_texts), "articles") - print("Dev instances pos/neg", dev_pos_count, dev_neg_count) + 
print("Dev test on", len(dev_ent), "entities in", len(dev_art_texts), "articles") + print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) print() print(" CUTOFF", self.CUTOFF) + print(" DOC_CUTOFF", self.DOC_CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) @@ -116,6 +126,10 @@ class EL_Model: print(" DROP", self.DROP) print() + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_random", calc_random=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_pre", avg=True) + print() + start = 0 stop = min(self.BATCH_SIZE, len(train_ent)) processed = 0 @@ -125,10 +139,10 @@ class EL_Model: golds = [train_gold[e] for e in next_batch] descs = [train_desc[e] for e in next_batch] - articles = [train_texts[train_article[e]] for e in next_batch] + articles = [train_art_texts[train_art[e]] for e in next_batch] self.update(entities=next_batch, golds=golds, descs=descs, texts=articles) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_inter", avg=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_inter", avg=True) processed += len(next_batch) @@ -151,7 +165,7 @@ class EL_Model: predictions = self._predict(entities=entities, article_docs=article_docs, desc_docs=desc_docs, avg=avg) # TODO: combine with prior probability - p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False) + p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False, times_hundred=False) loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) print("p/r/F/acc/loss", print_string, round(p, 1), round(r, 1), round(f, 1), round(acc, 2), round(loss, 5)) @@ -288,14 +302,18 @@ class EL_Model: collect_incorrect=True) local_vectors = list() # TODO: local vectors - text_by_article = dict() + + entities = set() gold_by_entity = dict() desc_by_entity = dict() article_by_entity = dict() - entities = list() + text_by_article = dict() + sentence_by_entity = dict() + text_by_sentence = dict() cnt = 0 - next_entity_nr = 0 + next_entity_nr = 1 + next_sent_nr = 1 files = listdir(training_dir) shuffle(files) for f in files: @@ -305,33 +323,81 @@ class EL_Model: if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") cnt += 1 - if article_id not in text_by_article: - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - text_by_article[article_id] = text + + # parse the article text + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = self.nlp(text) + truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] + text_by_article[article_id] = truncated_text + + # process all positive and negative entities, collect all relevant mentions in this article + article_terms = set() + entities_by_mention = dict() for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - entities.append(next_entity_nr) - gold_by_entity[next_entity_nr] = 1 - desc_by_entity[next_entity_nr] = descr - article_by_entity[next_entity_nr] = article_id + entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention next_entity_nr += 1 + gold_by_entity[entity] = 1 + desc_by_entity[entity] = descr + article_terms.add(mention) + mention_entities = 
entities_by_mention.get(mention, set()) + mention_entities.add(entity) + entities_by_mention[mention] = mention_entities for mention, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - entities.append(next_entity_nr) - gold_by_entity[next_entity_nr] = 0 - desc_by_entity[next_entity_nr] = descr - article_by_entity[next_entity_nr] = article_id + entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention next_entity_nr += 1 + gold_by_entity[entity] = 0 + desc_by_entity[entity] = descr + article_terms.add(mention) + mention_entities = entities_by_mention.get(mention, set()) + mention_entities.add(entity) + entities_by_mention[mention] = mention_entities + + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(self.nlp.vocab) + patterns = list(self.nlp.tokenizer.pipe(article_terms)) + + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + + # store sentences + sentence_to_id = dict() + for match_id, start, end in matches: + span = article_doc[start:end] + sent_text = span.sent + sent_nr = sentence_to_id.get(sent_text, None) + if sent_nr is None: + sent_nr = "S_" + str(next_sent_nr) + article_id + next_sent_nr += 1 + text_by_sentence[sent_nr] = sent_text + sentence_to_id[sent_text] = sent_nr + mention_entities = entities_by_mention[span.text] + for entity in mention_entities: + entities.add(entity) + sentence_by_entity[entity] = sent_nr + article_by_entity[entity] = article_id + + # remove entities that didn't have all data + gold_by_entity = {k: v for k, v in gold_by_entity.items() if k in entities} + desc_by_entity = {k: v for k, v in desc_by_entity.items() if k in entities} + + article_by_entity = {k: v for k, v in article_by_entity.items() if k in entities} + text_by_article = {k: v for k, v in text_by_article.items() if k in article_by_entity.values()} + + sentence_by_entity = {k: v for k, v in sentence_by_entity.items() if k in entities} + text_by_sentence = {k: v for k, v in text_by_sentence.items() if k in sentence_by_entity.values()} if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return entities, gold_by_entity, desc_by_entity, article_by_entity, text_by_article + return list(entities), gold_by_entity, desc_by_entity, article_by_entity, text_by_article, sentence_by_entity, text_by_sentence diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 715282642..319b1e1c8 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=400, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 86ed771e0ba83cea12be3f241d911bccd8a9afa1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 23 May 2019 16:59:11 +0200 Subject: [PATCH 054/148] adding local sentence encoder --- .../pipeline/wiki_entity_linking/train_el.py | 99 ++++++++++++------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files 
changed, 63 insertions(+), 38 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index d8082635a..63f8885cc 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -42,6 +42,7 @@ class EL_Model: HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 ARTICLE_WIDTH = 64 # 8 + SENT_WIDTH = 64 DROP = 0.1 @@ -55,6 +56,7 @@ class EL_Model: self._build_cnn(in_width=self.INPUT_DIM, desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, + sent_width=self.SENT_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH, hidden_2_width=self.HIDDEN_2_WIDTH) @@ -77,8 +79,8 @@ class EL_Model: print("article text", train_art_texts[train_art[entity]]) print() - train_pos_entities = [k for k,v in train_gold.items() if v] - train_neg_entities = [k for k,v in train_gold.items() if not v] + train_pos_entities = [k for k, v in train_gold.items() if v] + train_neg_entities = [k for k, v in train_gold.items() if not v] train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) @@ -122,12 +124,15 @@ class EL_Model: print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) + print(" SENT_WIDTH", self.SENT_WIDTH) print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_random", calc_random=True) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_pre", avg=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_random", calc_random=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_pre", avg=True) print() start = 0 @@ -139,10 +144,12 @@ class EL_Model: golds = [train_gold[e] for e in next_batch] descs = [train_desc[e] for e in next_batch] - articles = [train_art_texts[train_art[e]] for e in next_batch] + article_texts = [train_art_texts[train_art[e]] for e in next_batch] + sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] - self.update(entities=next_batch, golds=golds, descs=descs, texts=articles) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_inter", avg=True) + self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_inter", avg=True) processed += len(next_batch) @@ -153,7 +160,8 @@ class EL_Model: print() print("Trained on", processed, "entities in total") - def _test_dev(self, entities, gold_by_entity, desc_by_entity, article_by_entity, texts_by_id, print_string, avg=True, calc_random=False): + def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, + print_string, avg=True, calc_random=False): golds = [gold_by_entity[e] for e in entities] if calc_random: @@ -161,29 +169,35 @@ class EL_Model: else: desc_docs = self.nlp.pipe([desc_by_entity[e] for e in entities]) - article_docs = self.nlp.pipe([texts_by_id[article_by_entity[e]] for e in entities]) - predictions = self._predict(entities=entities, article_docs=article_docs, desc_docs=desc_docs, avg=avg) + article_docs = self.nlp.pipe([art_texts[art_by_entity[e]] for e in entities]) + sent_docs = 
self.nlp.pipe([sent_texts[sent_by_entity[e]] for e in entities]) + predictions = self._predict(entities=entities, article_docs=article_docs, sent_docs=sent_docs, + desc_docs=desc_docs, avg=avg) # TODO: combine with prior probability p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False, times_hundred=False) loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) - print("p/r/F/acc/loss", print_string, round(p, 1), round(r, 1), round(f, 1), round(acc, 2), round(loss, 5)) + print("p/r/F/acc/loss", print_string, round(p, 2), round(r, 2), round(f, 2), round(acc, 2), round(loss, 2)) return loss, p, r, f - def _predict(self, entities, article_docs, desc_docs, avg=True, apply_threshold=True): + def _predict(self, entities, article_docs, sent_docs, desc_docs, avg=True, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.desc_encoder.use_params(self.sgd_entity.averages): + and self.desc_encoder.use_params(self.sgd_desc.averages): doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) + sent_encodings = self.sent_encoder(sent_docs) else: doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) + sent_encodings = self.sent_encoder(sent_docs) + + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) for i in + range(len(entities))] - concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] np_array_list = np.asarray(concat_encodings) if avg: @@ -201,16 +215,17 @@ class EL_Model: def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: - return [float(random.uniform(0, 1)) for e in entities] + return [float(random.uniform(0, 1)) for _ in entities] else: - return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for e in entities] + return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn(self, in_width, desc_width, article_width, hidden_1_width, hidden_2_width): + def _build_cnn(self, in_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): self.desc_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=desc_width) self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) + self.sent_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=sent_width) - in_width = desc_width + article_width + in_width = article_width + sent_width + desc_width out_width = hidden_2_width self.model = Affine(out_width, in_width) \ @@ -224,7 +239,8 @@ class EL_Model: cnn_maxout_pieces = 3 with Model.define_operators({">>": chain}): - convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) + convolution = Residual((ExtractWindow(nW=1) >> + LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) encoder = SpacyVectors \ >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ @@ -241,7 +257,8 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_entity = create_default_optimizer(self.desc_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + 
self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd = create_default_optimizer(self.model.ops) @staticmethod @@ -251,17 +268,19 @@ class EL_Model: loss = (d_scores ** 2).mean() return loss, gradient - def update(self, entities, golds, descs, texts): + def update(self, entities, golds, descs, art_texts, sent_texts): golds = self.model.ops.asarray(golds) + art_docs = self.nlp.pipe(art_texts) + sent_docs = self.nlp.pipe(sent_texts) desc_docs = self.nlp.pipe(descs) - article_docs = self.nlp.pipe(texts) - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) + doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) + desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - desc_encodings, bp_entity = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - - concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) + for i in range(len(entities))] predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) predictions = self.model.ops.flatten(predictions) @@ -282,17 +301,23 @@ class EL_Model: model_gradient = bp_model(gradient, sgd=self.sgd) # print("model_gradient", model_gradient) - # concat = desc + doc, but doc is the same within this function (TODO: multiple docs/articles) - doc_gradient = model_gradient[0][self.DESC_WIDTH:] - entity_gradients = list() + # concat = doc + sent + desc, but doc is the same within this function + sent_start = self.ARTICLE_WIDTH + desc_start = self.ARTICLE_WIDTH + self.SENT_WIDTH + doc_gradient = model_gradient[0][0:sent_start] + sent_gradients = list() + desc_gradients = list() for x in model_gradient: - entity_gradients.append(list(x[0:self.DESC_WIDTH])) + sent_gradients.append(list(x[sent_start:desc_start])) + desc_gradients.append(list(x[desc_start:])) # print("doc_gradient", doc_gradient) - # print("entity_gradients", entity_gradients) + # print("sent_gradients", sent_gradients) + # print("desc_gradients", desc_gradients) bp_doc([doc_gradient], sgd=self.sgd_article) - bp_entity(entity_gradients, sgd=self.sgd_entity) + bp_sent(sent_gradients, sgd=self.sgd_sent) + bp_desc(desc_gradients, sgd=self.sgd_desc) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -301,8 +326,6 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - local_vectors = list() # TODO: local vectors - entities = set() gold_by_entity = dict() desc_by_entity = dict() @@ -372,14 +395,15 @@ class EL_Model: sentence_to_id = dict() for match_id, start, end in matches: span = article_doc[start:end] - sent_text = span.sent + sent_text = span.sent.text sent_nr = sentence_to_id.get(sent_text, None) + mention = span.text if sent_nr is None: sent_nr = "S_" + str(next_sent_nr) + article_id next_sent_nr += 1 text_by_sentence[sent_nr] = sent_text sentence_to_id[sent_text] = sent_nr - mention_entities = entities_by_mention[span.text] + mention_entities = entities_by_mention[mention] for entity in mention_entities: entities.add(entity) sentence_by_entity[entity] = sent_nr @@ -399,5 +423,6 @@ class EL_Model: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return list(entities), 
gold_by_entity, desc_by_entity, article_by_entity, text_by_article, sentence_by_entity, text_by_sentence + return list(entities), gold_by_entity, desc_by_entity, article_by_entity, text_by_article, \ + sentence_by_entity, text_by_sentence diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 319b1e1c8..ec1f66d81 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From abf9af81c93e94ed1bbcc4f295d1184e57312fbe Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 24 May 2019 22:04:25 +0200 Subject: [PATCH 055/148] learn rate en epochs --- .../pipeline/wiki_entity_linking/train_el.py | 85 ++++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 63f8885cc..efad36362 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -34,6 +34,7 @@ class EL_Model: CUTOFF = 0.5 BATCH_SIZE = 5 + UPSAMPLE = True DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors @@ -45,6 +46,8 @@ class EL_Model: SENT_WIDTH = 64 DROP = 0.1 + LEARN_RATE = 0.01 + EPOCHS = 10 name = "entity_linker" @@ -67,6 +70,12 @@ class EL_Model: train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ + self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) + + dev_pos_count = len([g for g in dev_gold.values() if g]) + dev_neg_count = len([g for g in dev_gold.values() if not g]) + # inspect data if self.PRINT_INSPECT: for entity in train_ent: @@ -85,28 +94,20 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) - if to_print: - print() - print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) + if self.UPSAMPLE: + if to_print: + print() + print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) - # upsample positives to 50-50 distribution - while train_pos_count < train_neg_count: - train_ent.append(random.choice(train_pos_entities)) - train_pos_count += 1 + # upsample positives to 50-50 distribution + while train_pos_count < train_neg_count: + train_ent.append(random.choice(train_pos_entities)) + train_pos_count += 1 - # upsample negatives to 50-50 distribution - while train_neg_count < train_pos_count: - train_ent.append(random.choice(train_neg_entities)) - train_neg_count += 1 - - shuffle(train_ent) - - dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ - self._get_training_data(training_dir, entity_descr_output, 
True, devlimit, to_print=False) - shuffle(dev_ent) - - dev_pos_count = len([g for g in dev_gold.values() if g]) - dev_neg_count = len([g for g in dev_gold.values() if not g]) + # upsample negatives to 50-50 distribution + while train_neg_count < train_pos_count: + train_ent.append(random.choice(train_neg_entities)) + train_neg_count += 1 self._begin_training() @@ -135,30 +136,34 @@ class EL_Model: print_string="dev_pre", avg=True) print() - start = 0 - stop = min(self.BATCH_SIZE, len(train_ent)) - processed = 0 + for i in range(self.EPOCHS): + print("EPOCH", i) + shuffle(train_ent) - while start < len(train_ent): - next_batch = train_ent[start:stop] + start = 0 + stop = min(self.BATCH_SIZE, len(train_ent)) + processed = 0 - golds = [train_gold[e] for e in next_batch] - descs = [train_desc[e] for e in next_batch] - article_texts = [train_art_texts[train_art[e]] for e in next_batch] - sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] + while start < len(train_ent): + next_batch = train_ent[start:stop] - self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_inter", avg=True) + golds = [train_gold[e] for e in next_batch] + descs = [train_desc[e] for e in next_batch] + article_texts = [train_art_texts[train_art[e]] for e in next_batch] + sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] - processed += len(next_batch) + self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_inter", avg=True) - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(train_ent)) + processed += len(next_batch) - if to_print: - print() - print("Trained on", processed, "entities in total") + start = start + self.BATCH_SIZE + stop = min(stop + self.BATCH_SIZE, len(train_ent)) + + if to_print: + print() + print("Trained on", processed, "entities in total") def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, print_string, avg=True, calc_random=False): @@ -257,9 +262,13 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_article.learn_rate = self.LEARN_RATE self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_sent.learn_rate = self.LEARN_RATE self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) + self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd = create_default_optimizer(self.model.ops) + self.sgd.learn_rate = self.LEARN_RATE @staticmethod def get_loss(predictions, golds): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ec1f66d81..cd7804ca4 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=20, devlimit=20) print() # 
STEP 7: apply the EL algorithm on the dev dataset From cfc27d7ff92abdc2962df4a61e9a38b3d383693f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 26 May 2019 23:39:46 +0200 Subject: [PATCH 056/148] using Tok2Vec instead --- .../pipeline/wiki_entity_linking/train_el.py | 82 +++++++++++++------ .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index efad36362..e0bea3f08 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -11,7 +11,7 @@ from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten from thinc.v2v import Model, Maxout, Affine, ReLu @@ -39,15 +39,15 @@ class EL_Model: DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors - HIDDEN_1_WIDTH = 32 # 10 - HIDDEN_2_WIDTH = 32 # 6 + # HIDDEN_1_WIDTH = 32 # 10 + # HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 ARTICLE_WIDTH = 64 # 8 SENT_WIDTH = 64 DROP = 0.1 - LEARN_RATE = 0.01 - EPOCHS = 10 + LEARN_RATE = 0.001 + EPOCHS = 20 name = "entity_linker" @@ -56,12 +56,9 @@ class EL_Model: self.nlp = nlp self.kb = kb - self._build_cnn(in_width=self.INPUT_DIM, - desc_width=self.DESC_WIDTH, + self._build_cnn(desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH, - hidden_1_width=self.HIDDEN_1_WIDTH, - hidden_2_width=self.HIDDEN_2_WIDTH) + sent_width=self.SENT_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow @@ -122,27 +119,29 @@ class EL_Model: print(" CUTOFF", self.CUTOFF) print(" DOC_CUTOFF", self.DOC_CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) - print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) + # print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) print(" SENT_WIDTH", self.SENT_WIDTH) - print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) + # print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) + print(" LEARNING RATE", self.LEARN_RATE) + print(" UPSAMPLE", self.UPSAMPLE) print() self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_random", calc_random=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_pre", avg=True) print() + processed = 0 for i in range(self.EPOCHS): - print("EPOCH", i) shuffle(train_ent) start = 0 stop = min(self.BATCH_SIZE, len(train_ent)) - processed = 0 while start < len(train_ent): next_batch = train_ent[start:stop] @@ -153,17 +152,22 @@ class EL_Model: sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_inter", avg=True) processed += len(next_batch) start = start + 
self.BATCH_SIZE stop = min(stop + self.BATCH_SIZE, len(train_ent)) - if to_print: - print() - print("Trained on", processed, "entities in total") + if self.PRINT_TRAIN: + self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, + print_string="train_inter_epoch " + str(i), avg=True) + + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_inter_epoch " + str(i), avg=True) + + if to_print: + print() + print("Trained on", processed, "entities across", self.EPOCHS, "epochs") def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, print_string, avg=True, calc_random=False): @@ -224,11 +228,11 @@ class EL_Model: else: return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn(self, in_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): + def _build_cnn_depr(self, embed_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=desc_width) - self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) - self.sent_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=sent_width) + self.desc_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) + self.article_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=article_width) + self.sent_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=sent_width) in_width = article_width + sent_width + desc_width out_width = hidden_2_width @@ -238,8 +242,28 @@ class EL_Model: >> Affine(1, out_width) \ >> logistic + def _build_cnn(self, desc_width, article_width, sent_width): + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + self.desc_encoder = self._encoder(width=desc_width) + self.article_encoder = self._encoder(width=article_width) + self.sent_encoder = self._encoder(width=sent_width) + + in_width = desc_width + article_width + sent_width + + output_layer = ( + zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic + ) + self.model = output_layer + self.model.nO = 1 + + def _encoder(self, width): + tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, + subword_features=True, conv_depth=4, bilstm_depth=0) + + return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) + @staticmethod - def _encoder(in_width, hidden_with, end_width): + def _encoder_depr(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 @@ -263,12 +287,19 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_article.learn_rate = self.LEARN_RATE + self.sgd_article.L2 = 0 + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_sent.learn_rate = self.LEARN_RATE + self.sgd_sent.L2 = 0 + self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd_desc.learn_rate = self.LEARN_RATE + self.sgd_desc.L2 = 0 + self.sgd = create_default_optimizer(self.model.ops) self.sgd.learn_rate = self.LEARN_RATE + self.sgd.L2 = 0 @staticmethod def get_loss(predictions, golds): @@ 
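The new Tok2Vec-based `_encoder` above ends in `flatten_add_lengths >> Pooling(mean_pool)`: the per-token vectors of the whole batch are concatenated into one matrix, the per-document lengths are kept alongside it, and each document is then reduced to a single fixed-width vector by averaging its token rows. A NumPy sketch of just that pooling step (toy shapes, no thinc involved):

    import numpy as np

    def mean_pool_batch(flat_vectors, lengths):
        """flat_vectors: (sum(lengths), width) matrix of token vectors for a batch.
        Returns one (width,) mean vector per document."""
        flat_vectors = np.asarray(flat_vectors, dtype="float32")
        pooled, start = [], 0
        for n in lengths:
            pooled.append(flat_vectors[start:start + n].mean(axis=0))
            start += n
        return np.stack(pooled)

    # two documents with 2 and 3 tokens each, width 4 -> a (2, 4) batch of encodings
    tokens = np.arange(20, dtype="float32").reshape(5, 4)
    print(mean_pool_batch(tokens, lengths=[2, 3]))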
-300,9 +331,6 @@ class EL_Model: loss, gradient = self.get_loss(predictions, golds) - if self.PRINT_TRAIN: - print("loss train", round(loss, 5)) - gradient = float(gradient) # print("gradient", gradient) # print("loss", loss) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index cd7804ca4..70fc200ab 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=20, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=1000) print() # STEP 7: apply the EL algorithm on the dev dataset From 8c4aa076bcb57ad7970de72229beb2d2e10335e4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 27 May 2019 14:29:38 +0200 Subject: [PATCH 057/148] small fixes --- .../pipeline/wiki_entity_linking/train_el.py | 36 ++++++++++++------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index e0bea3f08..e7d80d52b 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -29,7 +29,7 @@ from spacy.tokens import Doc class EL_Model: PRINT_INSPECT = False - PRINT_TRAIN = False + PRINT_TRAIN = True EPS = 0.0000000005 CUTOFF = 0.5 @@ -40,14 +40,15 @@ class EL_Model: INPUT_DIM = 300 # dimension of pre-trained vectors # HIDDEN_1_WIDTH = 32 # 10 - # HIDDEN_2_WIDTH = 32 # 6 + HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 ARTICLE_WIDTH = 64 # 8 SENT_WIDTH = 64 DROP = 0.1 - LEARN_RATE = 0.001 + LEARN_RATE = 0.0001 EPOCHS = 20 + L2 = 1e-6 name = "entity_linker" @@ -62,7 +63,10 @@ class EL_Model: def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow - np.seterr(all='raise') + # (not sure if we need this. 
set L2 to 0 because it throws an error otherwsise) + # np.seterr(all='raise') + # alternative: + np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) @@ -159,6 +163,7 @@ class EL_Model: stop = min(stop + self.BATCH_SIZE, len(train_ent)) if self.PRINT_TRAIN: + print() self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, print_string="train_inter_epoch " + str(i), avg=True) @@ -250,15 +255,20 @@ class EL_Model: in_width = desc_width + article_width + sent_width - output_layer = ( - zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic - ) - self.model = output_layer + self.model = Affine(self.HIDDEN_2_WIDTH, in_width) \ + >> LN(Maxout(self.HIDDEN_2_WIDTH, self.HIDDEN_2_WIDTH)) \ + >> Affine(1, self.HIDDEN_2_WIDTH) \ + >> logistic + + # output_layer = ( + # zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic + # ) + # self.model = output_layer self.model.nO = 1 def _encoder(self, width): tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, - subword_features=True, conv_depth=4, bilstm_depth=0) + subword_features=False, conv_depth=4, bilstm_depth=0) return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) @@ -287,19 +297,19 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_article.learn_rate = self.LEARN_RATE - self.sgd_article.L2 = 0 + self.sgd_article.L2 = self.L2 self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_sent.learn_rate = self.LEARN_RATE - self.sgd_sent.L2 = 0 + self.sgd_sent.L2 = self.L2 self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd_desc.learn_rate = self.LEARN_RATE - self.sgd_desc.L2 = 0 + self.sgd_desc.L2 = self.L2 self.sgd = create_default_optimizer(self.model.ops) self.sgd.learn_rate = self.LEARN_RATE - self.sgd.L2 = 0 + self.sgd.L2 = self.L2 @staticmethod def get_loss(predictions, golds): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 70fc200ab..319b1e1c8 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=1000) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 992fa92b6630ec8eb78ad378602a5774d7327de3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 28 May 2019 00:05:22 +0200 Subject: [PATCH 058/148] refactor again to clusters of entities and cosine similarity --- .../pipeline/wiki_entity_linking/train_el.py | 428 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 206 insertions(+), 224 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index e7d80d52b..ac8cae4a4 100644 --- 
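The `_begin_training` changes in these patches mostly tune two optimizer knobs per encoder: `learn_rate` (step size) and `L2` (weight decay). spaCy's `create_default_optimizer` returns a full-featured optimizer, so the sketch below only illustrates what the two settings control, not the actual update rule used:

    import numpy as np

    def gradient_step(weights, gradient, learn_rate=0.0001, L2=1e-6):
        """One plain SGD step with L2 weight decay: learn_rate scales the step,
        L2 adds a pull of the weights towards zero."""
        weights = np.asarray(weights, dtype="float32")
        gradient = np.asarray(gradient, dtype="float32")
        return weights - learn_rate * (gradient + L2 * weights)

    w = np.array([0.5, -0.3], dtype="float32")
    print(gradient_step(w, gradient=np.array([0.1, -0.2], dtype="float32")))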
a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -11,7 +11,7 @@ from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec, cosine from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten from thinc.v2v import Model, Maxout, Affine, ReLu @@ -20,6 +20,7 @@ from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN +from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -34,20 +35,20 @@ class EL_Model: CUTOFF = 0.5 BATCH_SIZE = 5 - UPSAMPLE = True + # UPSAMPLE = True DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors - # HIDDEN_1_WIDTH = 32 # 10 - HIDDEN_2_WIDTH = 32 # 6 - DESC_WIDTH = 64 # 4 - ARTICLE_WIDTH = 64 # 8 + HIDDEN_1_WIDTH = 32 + # HIDDEN_2_WIDTH = 32 # 6 + DESC_WIDTH = 64 + ARTICLE_WIDTH = 64 SENT_WIDTH = 64 DROP = 0.1 LEARN_RATE = 0.0001 - EPOCHS = 20 + EPOCHS = 10 L2 = 1e-6 name = "entity_linker" @@ -57,9 +58,10 @@ class EL_Model: self.nlp = nlp self.kb = kb - self._build_cnn(desc_width=self.DESC_WIDTH, + self._build_cnn(embed_width=self.INPUT_DIM, + desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH) + sent_width=self.SENT_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow @@ -70,24 +72,28 @@ class EL_Model: train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + train_clusters = list(train_ent.keys()) dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) + dev_clusters = list(dev_ent.keys()) dev_pos_count = len([g for g in dev_gold.values() if g]) dev_neg_count = len([g for g in dev_gold.values() if not g]) # inspect data if self.PRINT_INSPECT: - for entity in train_ent: - print("entity", entity) - print("gold", train_gold[entity]) - print("desc", train_desc[entity]) - print("sentence ID", train_sent[entity]) - print("sentence text", train_sent_texts[train_sent[entity]]) - print("article ID", train_art[entity]) - print("article text", train_art_texts[train_art[entity]]) + for cluster, entities in train_ent.items(): print() + for entity in entities: + print("entity", entity) + print("gold", train_gold[entity]) + print("desc", train_desc[entity]) + print("sentence ID", train_sent[entity]) + print("sentence text", train_sent_texts[train_sent[entity]]) + print("article ID", train_art[entity]) + print("article text", train_art_texts[train_art[entity]]) + print() train_pos_entities = [k for k, v in train_gold.items() if v] train_neg_entities = [k for k, v in train_gold.items() if not v] @@ -95,29 +101,29 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) - if self.UPSAMPLE: - if to_print: - print() - print("Upsampling, original training 
instances pos/neg:", train_pos_count, train_neg_count) - - # upsample positives to 50-50 distribution - while train_pos_count < train_neg_count: - train_ent.append(random.choice(train_pos_entities)) - train_pos_count += 1 - + # if self.UPSAMPLE: + # if to_print: + # print() + # print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) + # + # # upsample positives to 50-50 distribution + # while train_pos_count < train_neg_count: + # train_ent.append(random.choice(train_pos_entities)) + # train_pos_count += 1 + # # upsample negatives to 50-50 distribution - while train_neg_count < train_pos_count: - train_ent.append(random.choice(train_neg_entities)) - train_neg_count += 1 + # while train_neg_count < train_pos_count: + # train_ent.append(random.choice(train_neg_entities)) + # train_neg_count += 1 self._begin_training() if to_print: print() - print("Training on", len(train_ent), "entities in", len(train_art_texts), "articles") + print("Training on", len(train_clusters), "entity clusters in", len(train_art_texts), "articles") print("Training instances pos/neg:", train_pos_count, train_neg_count) print() - print("Dev test on", len(dev_ent), "entities in", len(dev_art_texts), "articles") + print("Dev test on", len(dev_clusters), "entity clusters in", len(dev_art_texts), "articles") print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) print() print(" CUTOFF", self.CUTOFF) @@ -138,94 +144,104 @@ class EL_Model: self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_pre", avg=True) - print() processed = 0 for i in range(self.EPOCHS): - shuffle(train_ent) + shuffle(train_clusters) start = 0 - stop = min(self.BATCH_SIZE, len(train_ent)) + stop = min(self.BATCH_SIZE, len(train_clusters)) - while start < len(train_ent): - next_batch = train_ent[start:stop] + while start < len(train_clusters): + next_batch = {c: train_ent[c] for c in train_clusters[start:stop]} + processed += len(next_batch.keys()) - golds = [train_gold[e] for e in next_batch] - descs = [train_desc[e] for e in next_batch] - article_texts = [train_art_texts[train_art[e]] for e in next_batch] - sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] - - self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) - - processed += len(next_batch) + self.update(entity_clusters=next_batch, golds=train_gold, descs=train_desc, + art_texts=train_art_texts, arts=train_art, + sent_texts=train_sent_texts, sents=train_sent) start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(train_ent)) + stop = min(stop + self.BATCH_SIZE, len(train_clusters)) if self.PRINT_TRAIN: print() self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, - print_string="train_inter_epoch " + str(i), avg=True) + print_string="train_inter_epoch " + str(i), avg=True) self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_inter_epoch " + str(i), avg=True) if to_print: print() - print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print("Trained on", processed, "entity clusters across", self.EPOCHS, "epochs") - def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, + def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, print_string, avg=True, calc_random=False): - golds = 
[gold_by_entity[e] for e in entities] - if calc_random: - predictions = self._predict_random(entities=entities) + correct = 0 + incorrect = 0 - else: - desc_docs = self.nlp.pipe([desc_by_entity[e] for e in entities]) - article_docs = self.nlp.pipe([art_texts[art_by_entity[e]] for e in entities]) - sent_docs = self.nlp.pipe([sent_texts[sent_by_entity[e]] for e in entities]) - predictions = self._predict(entities=entities, article_docs=article_docs, sent_docs=sent_docs, - desc_docs=desc_docs, avg=avg) + for cluster, entities in entity_clusters.items(): + correct_entities = [e for e in entities if golds[e]] + incorrect_entities = [e for e in entities if not golds[e]] + assert len(correct_entities) == 1 - # TODO: combine with prior probability - p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False, times_hundred=False) - loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) + entities = list(entities) + shuffle(entities) - print("p/r/F/acc/loss", print_string, round(p, 2), round(r, 2), round(f, 2), round(acc, 2), round(loss, 2)) + if calc_random: + predicted_entity = random.choice(entities) + if predicted_entity in correct_entities: + correct += 1 + else: + incorrect += 1 - return loss, p, r, f + else: + desc_docs = self.nlp.pipe([descs[e] for e in entities]) + # article_texts = [art_texts[arts[e]] for e in entities] - def _predict(self, entities, article_docs, sent_docs, desc_docs, avg=True, apply_threshold=True): + sent_doc = self.nlp(sent_texts[sents[cluster]]) + article_doc = self.nlp(art_texts[arts[cluster]]) + + predicted_index = self._predict(article_doc=article_doc, sent_doc=sent_doc, + desc_docs=desc_docs, avg=avg) + if entities[predicted_index] in correct_entities: + correct += 1 + else: + incorrect += 1 + + if correct == incorrect == 0: + print("acc", print_string, "NA") + return 0 + + acc = correct / (correct + incorrect) + print("acc", print_string, round(acc, 2)) + return acc + + def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.desc_encoder.use_params(self.sgd_desc.averages): - doc_encodings = self.article_encoder(article_docs) + and self.desc_encoder.use_params(self.sgd_desc.averages)\ + and self.sent_encoder.use_params(self.sgd_sent.averages): + # doc_encoding = self.article_encoder(article_doc) desc_encodings = self.desc_encoder(desc_docs) - sent_encodings = self.sent_encoder(sent_docs) + sent_encoding = self.sent_encoder([sent_doc]) else: - doc_encodings = self.article_encoder(article_docs) + # doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) - sent_encodings = self.sent_encoder(sent_docs) + sent_encoding = self.sent_encoder([sent_doc]) - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) for i in - range(len(entities))] + sent_enc = np.transpose(sent_encoding) + highest_sim = -5 + best_i = -1 + for i, desc_enc in enumerate(desc_encodings): + sim = cosine(desc_enc, sent_enc) + if sim >= highest_sim: + best_i = i + highest_sim = sim - np_array_list = np.asarray(concat_encodings) - - if avg: - with self.model.use_params(self.sgd.averages): - predictions = self.model(np_array_list) - else: - predictions = self.model(np_array_list) - - predictions = self.model.ops.flatten(predictions) - predictions = [float(p) for p in predictions] - if apply_threshold: - predictions = [float(1.0) if p > self.CUTOFF else 
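The rewritten `_predict` above turns entity linking into a ranking problem: every candidate description is encoded, the mention's context is encoded, and the candidate whose description is most cosine-similar to the context wins. The selection logic in isolation (NumPy sketch; spaCy's `cosine` helper is replaced by an explicit formula, and the vectors are made up):

    import numpy as np

    def cosine(u, v):
        u = np.asarray(u, dtype="float32")
        v = np.asarray(v, dtype="float32")
        return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))

    def rank_candidates(context_encoding, desc_encodings):
        """Return the index of the candidate description closest to the context."""
        sims = [cosine(desc, context_encoding) for desc in desc_encodings]
        return int(np.argmax(sims)), sims

    context = [0.9, 0.1, 0.0]                      # encoded mention context (toy)
    candidates = [[1.0, 0.0, 0.0],                 # encoded candidate descriptions (toy)
                  [0.0, 1.0, 0.0],
                  [0.0, 0.0, 1.0]]
    best, sims = rank_candidates(context, candidates)
    print(best, [round(s, 2) for s in sims])       # 0 [0.99, 0.11, 0.0]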
float(0.0) for p in predictions] - - return predictions + return best_i def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: @@ -233,47 +249,23 @@ class EL_Model: else: return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn_depr(self, embed_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): + def _build_cnn(self, embed_width, desc_width, article_width, sent_width, hidden_1_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) - self.article_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=article_width) - self.sent_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=sent_width) + self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, + end_width=desc_width) + self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, + end_width=article_width) + self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, + end_width=sent_width) - in_width = article_width + sent_width + desc_width - out_width = hidden_2_width - - self.model = Affine(out_width, in_width) \ - >> LN(Maxout(out_width, out_width)) \ - >> Affine(1, out_width) \ - >> logistic - - def _build_cnn(self, desc_width, article_width, sent_width): - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder(width=desc_width) - self.article_encoder = self._encoder(width=article_width) - self.sent_encoder = self._encoder(width=sent_width) - - in_width = desc_width + article_width + sent_width - - self.model = Affine(self.HIDDEN_2_WIDTH, in_width) \ - >> LN(Maxout(self.HIDDEN_2_WIDTH, self.HIDDEN_2_WIDTH)) \ - >> Affine(1, self.HIDDEN_2_WIDTH) \ - >> logistic - - # output_layer = ( - # zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic - # ) - # self.model = output_layer - self.model.nO = 1 - - def _encoder(self, width): - tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, - subword_features=False, conv_depth=4, bilstm_depth=0) - - return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) + # def _encoder(self, width): + # tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, + # subword_features=False, conv_depth=4, bilstm_depth=0) + # + # return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) @staticmethod - def _encoder_depr(in_width, hidden_with, end_width): + def _encoder(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 @@ -307,64 +299,58 @@ class EL_Model: self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - self.sgd = create_default_optimizer(self.model.ops) - self.sgd.learn_rate = self.LEARN_RATE - self.sgd.L2 = self.L2 + # self.sgd = create_default_optimizer(self.model.ops) + # self.sgd.learn_rate = self.LEARN_RATE + # self.sgd.L2 = self.L2 @staticmethod def get_loss(predictions, golds): - d_scores = (predictions - golds) - gradient = d_scores.mean() - loss = (d_scores ** 2).mean() - return loss, gradient + loss, gradients = get_cossim_loss(predictions, golds) + return loss, gradients - def update(self, entities, golds, descs, art_texts, sent_texts): - golds = self.model.ops.asarray(golds) + 
def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): + for cluster, entities in entity_clusters.items(): + correct_entities = [e for e in entities if golds[e]] + incorrect_entities = [e for e in entities if not golds[e]] - art_docs = self.nlp.pipe(art_texts) - sent_docs = self.nlp.pipe(sent_texts) - desc_docs = self.nlp.pipe(descs) + assert len(correct_entities) == 1 + entities = list(entities) + shuffle(entities) - doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) - desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) + # article_text = art_texts[arts[cluster]] + cluster_sent = sent_texts[sents[cluster]] - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) - for i in range(len(entities))] + # art_docs = self.nlp.pipe(article_text) + sent_doc = self.nlp(cluster_sent) - predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - predictions = self.model.ops.flatten(predictions) + for e in entities: + if golds[e]: + # TODO: more appropriate loss for the whole cluster (currently only pos entities) + # TODO: speed up + desc_doc = self.nlp(descs[e]) - # print("entities", entities) - # print("predictions", predictions) - # print("golds", golds) + # doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) + sent_encodings, bp_sent = self.sent_encoder.begin_update([sent_doc], drop=self.DROP) + desc_encodings, bp_desc = self.desc_encoder.begin_update([desc_doc], drop=self.DROP) - loss, gradient = self.get_loss(predictions, golds) + sent_encoding = sent_encodings[0] + desc_encoding = desc_encodings[0] - gradient = float(gradient) - # print("gradient", gradient) - # print("loss", loss) + sent_enc = self.sent_encoder.ops.asarray([sent_encoding]) + desc_enc = self.sent_encoder.ops.asarray([desc_encoding]) - model_gradient = bp_model(gradient, sgd=self.sgd) - # print("model_gradient", model_gradient) + # print("sent_encoding", type(sent_encoding), sent_encoding) + # print("desc_encoding", type(desc_encoding), desc_encoding) + # print("getting los for entity", e) - # concat = doc + sent + desc, but doc is the same within this function - sent_start = self.ARTICLE_WIDTH - desc_start = self.ARTICLE_WIDTH + self.SENT_WIDTH - doc_gradient = model_gradient[0][0:sent_start] - sent_gradients = list() - desc_gradients = list() - for x in model_gradient: - sent_gradients.append(list(x[sent_start:desc_start])) - desc_gradients.append(list(x[desc_start:])) + loss, gradient = self.get_loss(sent_enc, desc_enc) - # print("doc_gradient", doc_gradient) - # print("sent_gradients", sent_gradients) - # print("desc_gradients", desc_gradients) + # print("gradient", gradient) + # print("loss", loss) - bp_doc([doc_gradient], sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) - bp_desc(desc_gradients, sgd=self.sgd_desc) + bp_sent(gradient, sgd=self.sgd_sent) + # bp_desc(desc_gradients, sgd=self.sgd_desc) TODO + # print() def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -373,13 +359,14 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - entities = set() + entities_by_cluster = dict() gold_by_entity = dict() desc_by_entity = dict() - article_by_entity = dict() + article_by_cluster = dict() text_by_article = 
dict() - sentence_by_entity = dict() + sentence_by_cluster = dict() text_by_sentence = dict() + sentence_by_text = dict() cnt = 0 next_entity_nr = 1 @@ -402,74 +389,69 @@ class EL_Model: text_by_article[article_id] = truncated_text # process all positive and negative entities, collect all relevant mentions in this article - article_terms = set() - entities_by_mention = dict() - for mention, entity_pos in correct_entries[article_id].items(): + cluster = article_id + "_" + mention descr = id_to_descr.get(entity_pos) + entities = set() if descr: - entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention + entity = "E_" + str(next_entity_nr) + "_" + cluster next_entity_nr += 1 gold_by_entity[entity] = 1 desc_by_entity[entity] = descr - article_terms.add(mention) - mention_entities = entities_by_mention.get(mention, set()) - mention_entities.add(entity) - entities_by_mention[mention] = mention_entities - - for mention, entity_negs in incorrect_entries[article_id].items(): - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: - entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention - next_entity_nr += 1 - gold_by_entity[entity] = 0 - desc_by_entity[entity] = descr - article_terms.add(mention) - mention_entities = entities_by_mention.get(mention, set()) - mention_entities.add(entity) - entities_by_mention[mention] = mention_entities - - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(self.nlp.vocab) - patterns = list(self.nlp.tokenizer.pipe(article_terms)) - - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) - - # store sentences - sentence_to_id = dict() - for match_id, start, end in matches: - span = article_doc[start:end] - sent_text = span.sent.text - sent_nr = sentence_to_id.get(sent_text, None) - mention = span.text - if sent_nr is None: - sent_nr = "S_" + str(next_sent_nr) + article_id - next_sent_nr += 1 - text_by_sentence[sent_nr] = sent_text - sentence_to_id[sent_text] = sent_nr - mention_entities = entities_by_mention[mention] - for entity in mention_entities: entities.add(entity) - sentence_by_entity[entity] = sent_nr - article_by_entity[entity] = article_id - # remove entities that didn't have all data - gold_by_entity = {k: v for k, v in gold_by_entity.items() if k in entities} - desc_by_entity = {k: v for k, v in desc_by_entity.items() if k in entities} + entity_negs = incorrect_entries[article_id][mention] + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + entity = "E_" + str(next_entity_nr) + "_" + cluster + next_entity_nr += 1 + gold_by_entity[entity] = 0 + desc_by_entity[entity] = descr + entities.add(entity) - article_by_entity = {k: v for k, v in article_by_entity.items() if k in entities} - text_by_article = {k: v for k, v in text_by_article.items() if k in article_by_entity.values()} + found_matches = 0 + if len(entities) > 1: + entities_by_cluster[cluster] = entities + + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(self.nlp.vocab) + patterns = list(self.nlp.tokenizer.pipe([mention])) + + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + + + # store sentences + for match_id, start, end in matches: + found_matches += 1 + span = article_doc[start:end] + assert mention == span.text + sent_text = span.sent.text + sent_nr = 
sentence_by_text.get(sent_text, None) + if sent_nr is None: + sent_nr = "S_" + str(next_sent_nr) + article_id + next_sent_nr += 1 + text_by_sentence[sent_nr] = sent_text + sentence_by_text[sent_text] = sent_nr + article_by_cluster[cluster] = article_id + sentence_by_cluster[cluster] = sent_nr + + if found_matches == 0: + # TODO print("Could not find neg instances or sentence matches for", mention, "in", article_id) + entities_by_cluster.pop(cluster, None) + article_by_cluster.pop(cluster, None) + sentence_by_cluster.pop(cluster, None) + for entity in entities: + gold_by_entity.pop(entity, None) + desc_by_entity.pop(entity, None) - sentence_by_entity = {k: v for k, v in sentence_by_entity.items() if k in entities} - text_by_sentence = {k: v for k, v in text_by_sentence.items() if k in sentence_by_entity.values()} if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return list(entities), gold_by_entity, desc_by_entity, article_by_entity, text_by_article, \ - sentence_by_entity, text_by_sentence + return entities_by_cluster, gold_by_entity, desc_by_entity, article_by_cluster, text_by_article, \ + sentence_by_cluster, text_by_sentence diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 319b1e1c8..a24ff30c5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) print() # STEP 7: apply the EL algorithm on the dev dataset From a761929fa50365663c8e897c8e5664a22438b3bd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 28 May 2019 18:14:49 +0200 Subject: [PATCH 059/148] context encoder combining sentence and article --- .../pipeline/wiki_entity_linking/train_el.py | 257 ++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 138 insertions(+), 121 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index ac8cae4a4..ea42f9ab6 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -11,11 +11,11 @@ from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec, cosine +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten -from thinc.v2v import Model, Maxout, Affine, ReLu -from thinc.t2v import Pooling, mean_pool, sum_pool +from thinc.v2v import Model, Maxout, Affine +from thinc.t2v import Pooling, mean_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN @@ -30,24 +30,21 @@ from spacy.tokens import Doc class EL_Model: PRINT_INSPECT = False - PRINT_TRAIN = True + PRINT_BATCH_LOSS = False EPS = 0.0000000005 - CUTOFF = 0.5 BATCH_SIZE = 5 - # UPSAMPLE = True DOC_CUTOFF = 300 # number 
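The training-data collection above relies on spaCy's `PhraseMatcher` to find each mention in the article and then takes `span.sent` to recover the sentence it occurs in. A self-contained example of that lookup, using the same spaCy 2.x call signature as the patch (the model name and example text are placeholders; any model with a parser works, since `.sent` needs sentence boundaries):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load("en_core_web_md")  # placeholder; the pipeline script loads this model
    doc = nlp("Douglas Adams wrote the guide. The guide made Adams famous.")

    matcher = PhraseMatcher(nlp.vocab)
    patterns = list(nlp.tokenizer.pipe(["Adams"]))
    matcher.add("TerminologyList", None, *patterns)

    for match_id, start, end in matcher(doc):
        span = doc[start:end]
        print(span.text, "->", span.sent.text)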
of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors HIDDEN_1_WIDTH = 32 - # HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 - ARTICLE_WIDTH = 64 + ARTICLE_WIDTH = 128 SENT_WIDTH = 64 DROP = 0.1 - LEARN_RATE = 0.0001 + LEARN_RATE = 0.001 EPOCHS = 10 L2 = 1e-6 @@ -61,13 +58,10 @@ class EL_Model: self._build_cnn(embed_width=self.INPUT_DIM, desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH) + sent_width=self.SENT_WIDTH, + hidden_1_width=self.HIDDEN_1_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): - # raise errors instead of runtime warnings in case of int/float overflow - # (not sure if we need this. set L2 to 0 because it throws an error otherwsise) - # np.seterr(all='raise') - # alternative: np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ @@ -101,21 +95,6 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) - # if self.UPSAMPLE: - # if to_print: - # print() - # print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) - # - # # upsample positives to 50-50 distribution - # while train_pos_count < train_neg_count: - # train_ent.append(random.choice(train_pos_entities)) - # train_pos_count += 1 - # - # upsample negatives to 50-50 distribution - # while train_neg_count < train_pos_count: - # train_ent.append(random.choice(train_neg_entities)) - # train_neg_count += 1 - self._begin_training() if to_print: @@ -126,24 +105,25 @@ class EL_Model: print("Dev test on", len(dev_clusters), "entity clusters in", len(dev_art_texts), "articles") print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) print() - print(" CUTOFF", self.CUTOFF) print(" DOC_CUTOFF", self.DOC_CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) - # print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) + print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) print(" SENT_WIDTH", self.SENT_WIDTH) - # print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print(" LEARNING RATE", self.LEARN_RATE) - print(" UPSAMPLE", self.UPSAMPLE) + print(" BATCH SIZE", self.BATCH_SIZE) print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_random", calc_random=True) + dev_random = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + calc_random=True) + print("acc", "dev_random", round(dev_random, 2)) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_pre", avg=True) + dev_pre = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + avg=True) + print("acc", "dev_pre", round(dev_pre, 2)) + print() processed = 0 for i in range(self.EPOCHS): @@ -163,45 +143,58 @@ class EL_Model: start = start + self.BATCH_SIZE stop = min(stop + self.BATCH_SIZE, len(train_clusters)) - if self.PRINT_TRAIN: - print() - self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, - print_string="train_inter_epoch " + str(i), avg=True) + train_acc = self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, avg=True) + 
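The `dev_random`, `dev_pre` and per-epoch `train/dev` numbers printed above all come from the same cluster-level evaluation: each mention cluster contains exactly one gold candidate, and accuracy is the fraction of clusters where the chosen candidate is that gold one. Restated standalone (sketch; the cluster and entity IDs below are made up):

    import random

    def cluster_accuracy(entity_clusters, golds, choose):
        """choose(entities) returns the predicted entity for one candidate cluster."""
        correct = incorrect = 0
        for cluster, entities in entity_clusters.items():
            gold = [e for e in entities if golds[e]]
            assert len(gold) == 1
            if choose(list(entities)) in gold:
                correct += 1
            else:
                incorrect += 1
        return correct / (correct + incorrect) if (correct + incorrect) else 0.0

    clusters = {"art1_Adams": {"e1", "e2", "e3"}, "art2_Guide": {"e4", "e5"}}
    golds = {"e1": 1, "e2": 0, "e3": 0, "e4": 0, "e5": 1}
    print(cluster_accuracy(clusters, golds, choose=random.choice))  # random baseline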
dev_acc = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, avg=True) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_inter_epoch " + str(i), avg=True) + print(i, "acc train/dev", round(train_acc, 2), round(dev_acc, 2)) if to_print: print() print("Trained on", processed, "entity clusters across", self.EPOCHS, "epochs") - def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, - print_string, avg=True, calc_random=False): - + def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, avg=True, calc_random=False): correct = 0 incorrect = 0 - for cluster, entities in entity_clusters.items(): - correct_entities = [e for e in entities if golds[e]] - incorrect_entities = [e for e in entities if not golds[e]] - assert len(correct_entities) == 1 + if calc_random: + for cluster, entities in entity_clusters.items(): + correct_entities = [e for e in entities if golds[e]] + assert len(correct_entities) == 1 - entities = list(entities) - shuffle(entities) + entities = list(entities) + shuffle(entities) - if calc_random: - predicted_entity = random.choice(entities) - if predicted_entity in correct_entities: - correct += 1 - else: - incorrect += 1 + if calc_random: + predicted_entity = random.choice(entities) + if predicted_entity in correct_entities: + correct += 1 + else: + incorrect += 1 + + else: + all_clusters = list() + arts_list = list() + sents_list = list() + + for cluster in entity_clusters.keys(): + all_clusters.append(cluster) + arts_list.append(art_texts[arts[cluster]]) + sents_list.append(sent_texts[sents[cluster]]) + + art_docs = list(self.nlp.pipe(arts_list)) + sent_docs = list(self.nlp.pipe(sents_list)) + + for i, cluster in enumerate(all_clusters): + entities = entity_clusters[cluster] + correct_entities = [e for e in entities if golds[e]] + assert len(correct_entities) == 1 + + entities = list(entities) + shuffle(entities) - else: desc_docs = self.nlp.pipe([descs[e] for e in entities]) - # article_texts = [art_texts[arts[e]] for e in entities] - - sent_doc = self.nlp(sent_texts[sents[cluster]]) - article_doc = self.nlp(art_texts[arts[cluster]]) + sent_doc = sent_docs[i] + article_doc = art_docs[i] predicted_index = self._predict(article_doc=article_doc, sent_doc=sent_doc, desc_docs=desc_docs, avg=avg) @@ -211,52 +204,56 @@ class EL_Model: incorrect += 1 if correct == incorrect == 0: - print("acc", print_string, "NA") return 0 acc = correct / (correct + incorrect) - print("acc", print_string, round(acc, 2)) return acc def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): + # print() + # print("predicting article") + if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and self.desc_encoder.use_params(self.sgd_desc.averages)\ - and self.sent_encoder.use_params(self.sgd_sent.averages): - # doc_encoding = self.article_encoder(article_doc) + and self.sent_encoder.use_params(self.sgd_sent.averages)\ + and self.cont_encoder.use_params(self.sgd_cont.averages): desc_encodings = self.desc_encoder(desc_docs) + doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) else: - # doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) + doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) - sent_enc = np.transpose(sent_encoding) + # print("desc_encodings", 
desc_encodings) + # print("doc_encoding", doc_encoding) + # print("sent_encoding", sent_encoding) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + # print("concat_encoding", concat_encoding) + + cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + # print("cont_encodings", cont_encodings) + context_enc = np.transpose(cont_encodings) + # print("context_enc", context_enc) + highest_sim = -5 best_i = -1 for i, desc_enc in enumerate(desc_encodings): - sim = cosine(desc_enc, sent_enc) + sim = cosine(desc_enc, context_enc) if sim >= highest_sim: best_i = i highest_sim = sim return best_i - def _predict_random(self, entities, apply_threshold=True): - if not apply_threshold: - return [float(random.uniform(0, 1)) for _ in entities] - else: - return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn(self, embed_width, desc_width, article_width, sent_width, hidden_1_width): - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, - end_width=desc_width) - self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, - end_width=article_width) - self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, - end_width=sent_width) + self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) + self.cont_encoder = self._context_encoder(embed_width=embed_width, article_width=article_width, + sent_width=sent_width, hidden_width=hidden_1_width, + end_width=desc_width) + # def _encoder(self, width): # tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, @@ -264,12 +261,19 @@ class EL_Model: # # return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) + def _context_encoder(self, embed_width, article_width, sent_width, hidden_width, end_width): + self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) + self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) + + model = Affine(end_width, article_width+sent_width, drop_factor=0.0) + return model + @staticmethod def _encoder(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 - with Model.define_operators({">>": chain}): + with Model.define_operators({">>": chain, "**": clone}): convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) @@ -295,62 +299,75 @@ class EL_Model: self.sgd_sent.learn_rate = self.LEARN_RATE self.sgd_sent.L2 = self.L2 + self.sgd_cont = create_default_optimizer(self.cont_encoder.ops) + self.sgd_cont.learn_rate = self.LEARN_RATE + self.sgd_cont.L2 = self.L2 + self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - # self.sgd = create_default_optimizer(self.model.ops) - # self.sgd.learn_rate = self.LEARN_RATE - # self.sgd.L2 = self.L2 - @staticmethod def get_loss(predictions, golds): loss, gradients = get_cossim_loss(predictions, golds) return loss, gradients def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): + all_clusters = list(entity_clusters.keys()) + + arts_list = list() + sents_list = list() + descs_list = list() + for cluster, entities in entity_clusters.items(): - correct_entities = [e for e 
in entities if golds[e]] - incorrect_entities = [e for e in entities if not golds[e]] - - assert len(correct_entities) == 1 - entities = list(entities) - shuffle(entities) - - # article_text = art_texts[arts[cluster]] - cluster_sent = sent_texts[sents[cluster]] - - # art_docs = self.nlp.pipe(article_text) - sent_doc = self.nlp(cluster_sent) - + art = art_texts[arts[cluster]] + sent = sent_texts[sents[cluster]] for e in entities: + # TODO: more appropriate loss for the whole cluster (currently only pos entities) if golds[e]: - # TODO: more appropriate loss for the whole cluster (currently only pos entities) - # TODO: speed up - desc_doc = self.nlp(descs[e]) + arts_list.append(art) + sents_list.append(sent) + descs_list.append(descs[e]) - # doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) - sent_encodings, bp_sent = self.sent_encoder.begin_update([sent_doc], drop=self.DROP) - desc_encodings, bp_desc = self.desc_encoder.begin_update([desc_doc], drop=self.DROP) + desc_docs = self.nlp.pipe(descs_list) + desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - sent_encoding = sent_encodings[0] - desc_encoding = desc_encodings[0] + art_docs = self.nlp.pipe(arts_list) + sent_docs = self.nlp.pipe(sents_list) - sent_enc = self.sent_encoder.ops.asarray([sent_encoding]) - desc_enc = self.sent_encoder.ops.asarray([desc_encoding]) + doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) - # print("sent_encoding", type(sent_encoding), sent_encoding) - # print("desc_encoding", type(desc_encoding), desc_encoding) - # print("getting los for entity", e) + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + range(len(all_clusters))] + cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - loss, gradient = self.get_loss(sent_enc, desc_enc) + # print("sent_encodings", type(sent_encodings), sent_encodings) + # print("desc_encodings", type(desc_encodings), desc_encodings) + # print("doc_encodings", type(doc_encodings), doc_encodings) + # print("getting los for", len(arts_list), "entities") - # print("gradient", gradient) - # print("loss", loss) + loss, gradient = self.get_loss(cont_encodings, desc_encodings) - bp_sent(gradient, sgd=self.sgd_sent) - # bp_desc(desc_gradients, sgd=self.sgd_desc) TODO - # print() + # print("gradient", gradient) + if self.PRINT_BATCH_LOSS: + print("batch loss", loss) + + context_gradient = bp_cont(gradient, sgd=self.sgd_cont) + + # gradient : concat (doc+sent) vs. 
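In the batched `update` above, the article encoding and the sentence encoding are concatenated and pushed through the affine context encoder, so the gradient coming back from the loss is a single vector over the concatenation; the lines that follow slice it back into an article part and a sentence part before calling each encoder's backprop. The forward/backward bookkeeping in miniature (NumPy, toy widths):

    import numpy as np

    ARTICLE_WIDTH, SENT_WIDTH, DESC_WIDTH = 3, 2, 4
    rng = np.random.RandomState(0)
    W = rng.normal(size=(DESC_WIDTH, ARTICLE_WIDTH + SENT_WIDTH)).astype("float32")
    b = np.zeros(DESC_WIDTH, dtype="float32")

    def context_encoder(doc_enc, sent_enc):
        """Affine projection of the concatenated (article, sentence) encodings."""
        x = np.concatenate([doc_enc, sent_enc])
        return W @ x + b

    doc_enc = rng.normal(size=ARTICLE_WIDTH).astype("float32")
    sent_enc = rng.normal(size=SENT_WIDTH).astype("float32")
    out = context_encoder(doc_enc, sent_enc)

    d_out = np.ones(DESC_WIDTH, dtype="float32")   # stand-in gradient from the loss
    d_x = W.T @ d_out                              # gradient w.r.t. the concatenation
    d_doc, d_sent = d_x[:ARTICLE_WIDTH], d_x[ARTICLE_WIDTH:]  # routed to each encoder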
desc + sent_start = self.ARTICLE_WIDTH + sent_gradients = list() + doc_gradients = list() + for x in context_gradient: + doc_gradients.append(list(x[0:sent_start])) + sent_gradients.append(list(x[sent_start:])) + + # print("doc_gradients", doc_gradients) + # print("sent_gradients", sent_gradients) + + bp_doc(doc_gradients, sgd=self.sgd_article) + bp_sent(sent_gradients, sgd=self.sgd_sent) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a24ff30c5..25c1e4721 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=5000, devlimit=100) print() # STEP 7: apply the EL algorithm on the dev dataset From 268a52ead7bbad21a22df11e9446971102193bcf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 29 May 2019 16:07:53 +0200 Subject: [PATCH 060/148] experimenting with cosine sim for negative examples (not OK yet) --- .../pipeline/wiki_entity_linking/train_el.py | 44 ++++++++++++++++--- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index ea42f9ab6..ba8a6a6c9 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -8,6 +8,7 @@ import numpy as np import random from random import shuffle from thinc.neural._classes.convolution import ExtractWindow +from thinc.neural.util import get_array_module from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -20,7 +21,7 @@ from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN -from spacy.cli.pretrain import get_cossim_loss +# from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -307,27 +308,56 @@ class EL_Model: self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - @staticmethod - def get_loss(predictions, golds): - loss, gradients = get_cossim_loss(predictions, golds) + def get_loss(self, v1, v2, targets): + loss, gradients = self.get_cossim_loss(v1, v2, targets) return loss, gradients + def get_cossim_loss(self, yh, y, t): + # Add a small constant to avoid 0 vectors + # print() + # print("yh", yh) + # print("y", y) + # print("t", t) + yh = yh + 1e-8 + y = y + 1e-8 + # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity + xp = get_array_module(yh) + norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) + norm_y = xp.linalg.norm(y, axis=1, keepdims=True) + mul_norms = norm_yh * norm_y + cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms + # print("cos", cos) + d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) + # print("abs", xp.abs(cos - t)) + loss = xp.abs(cos - t).sum() + # print("loss", loss) + # print("d_yh", 
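The custom `get_cossim_loss` above generalises the cosine loss to signed targets: t = +1 pulls a context encoding towards its description, t = -1 pushes it away, and the returned gradient is -t times the derivative of the cosine, which for t in {+1, -1} is the derivative of |cos - t|. A standalone NumPy restatement, with a toy check:

    import numpy as np

    def cossim_loss(yh, y, t):
        """Cosine-similarity loss with targets t in {+1, -1}; rows of yh and y are vectors."""
        yh = yh + 1e-8          # avoid zero vectors
        y = y + 1e-8
        norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
        norm_y = np.linalg.norm(y, axis=1, keepdims=True)
        cos = (yh * y).sum(axis=1, keepdims=True) / (norm_yh * norm_y)
        d_cos = y / (norm_yh * norm_y) - cos * yh / norm_yh ** 2   # d(cos)/d(yh)
        loss = np.abs(cos - t).sum()
        # for t = +1 this is -d_cos (minimising 1 - cos); for t = -1 it is +d_cos (minimising cos + 1)
        d_yh = -t * d_cos
        return loss, d_yh

    yh = np.array([[1.0, 0.0]])
    y = np.array([[0.0, 1.0]])
    loss, grad = cossim_loss(yh, y, np.array([[1.0]]))
    print(loss, grad)           # descending along grad moves the cosine towards +1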
d_yh) + inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) + # print("inverse", inverse) + return loss, -inverse + def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): all_clusters = list(entity_clusters.keys()) arts_list = list() sents_list = list() descs_list = list() + targets = list() for cluster, entities in entity_clusters.items(): art = art_texts[arts[cluster]] sent = sent_texts[sents[cluster]] for e in entities: - # TODO: more appropriate loss for the whole cluster (currently only pos entities) if golds[e]: arts_list.append(art) sents_list.append(sent) descs_list.append(descs[e]) + targets.append([1]) + else: + arts_list.append(art) + sents_list.append(sent) + descs_list.append(descs[e]) + targets.append([-1]) desc_docs = self.nlp.pipe(descs_list) desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) @@ -339,7 +369,7 @@ class EL_Model: sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(all_clusters))] + range(len(targets))] cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) # print("sent_encodings", type(sent_encodings), sent_encodings) @@ -347,7 +377,7 @@ class EL_Model: # print("doc_encodings", type(doc_encodings), doc_encodings) # print("getting los for", len(arts_list), "entities") - loss, gradient = self.get_loss(cont_encodings, desc_encodings) + loss, gradient = self.get_loss(cont_encodings, desc_encodings, targets) # print("gradient", gradient) if self.PRINT_BATCH_LOSS: diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 25c1e4721..a24ff30c5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=5000, devlimit=100) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) print() # STEP 7: apply the EL algorithm on the dev dataset From 9e88763dab895d7ee86a21d78c0e2c950e8d6850 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Jun 2019 08:04:49 +0200 Subject: [PATCH 061/148] 60% acc run --- .../pipeline/wiki_entity_linking/train_el.py | 159 ++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 3 +- 2 files changed, 74 insertions(+), 88 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index ba8a6a6c9..a2db2dc95 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -23,7 +23,6 @@ from thinc.misc import LayerNorm as LN # from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc """ TODO: this code needs to be implemented in pipes.pyx""" @@ -46,7 +45,7 @@ class EL_Model: DROP = 0.1 LEARN_RATE = 0.001 - EPOCHS = 10 + EPOCHS = 20 L2 = 1e-6 name = "entity_linker" @@ -211,9 +210,6 @@ class EL_Model: return acc def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): - # print() - # print("predicting 
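The `update` rewrite above is mostly bookkeeping: each candidate cluster contributes one (article, sentence, description) triple per candidate, with target +1 for the gold entity and -1 for the negatives, and those parallel lists are what get encoded in one batch. The bookkeeping on its own (sketch; note that the next patch, "60% acc run", drops the -1 rows again and trains on the gold candidates only):

    def make_training_rows(entity_clusters, golds, descs, art_texts, arts, sent_texts, sents):
        """Flatten candidate clusters into parallel lists for batched encoding."""
        arts_list, sents_list, descs_list, targets = [], [], [], []
        for cluster, entities in entity_clusters.items():
            art = art_texts[arts[cluster]]
            sent = sent_texts[sents[cluster]]
            for e in entities:
                arts_list.append(art)
                sents_list.append(sent)
                descs_list.append(descs[e])
                targets.append([1] if golds[e] else [-1])
        return arts_list, sents_list, descs_list, targets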
article") - if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and self.desc_encoder.use_params(self.sgd_desc.averages)\ @@ -228,16 +224,10 @@ class EL_Model: doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) - # print("desc_encodings", desc_encodings) - # print("doc_encoding", doc_encoding) - # print("sent_encoding", sent_encoding) concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - # print("concat_encoding", concat_encoding) cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) - # print("cont_encodings", cont_encodings) context_enc = np.transpose(cont_encodings) - # print("context_enc", context_enc) highest_sim = -5 best_i = -1 @@ -353,11 +343,11 @@ class EL_Model: sents_list.append(sent) descs_list.append(descs[e]) targets.append([1]) - else: - arts_list.append(art) - sents_list.append(sent) - descs_list.append(descs[e]) - targets.append([-1]) + # else: + # arts_list.append(art) + # sents_list.append(sent) + # descs_list.append(descs[e]) + # targets.append([-1]) desc_docs = self.nlp.pipe(descs_list) desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) @@ -372,18 +362,17 @@ class EL_Model: range(len(targets))] cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - # print("sent_encodings", type(sent_encodings), sent_encodings) - # print("desc_encodings", type(desc_encodings), desc_encodings) - # print("doc_encodings", type(doc_encodings), doc_encodings) - # print("getting los for", len(arts_list), "entities") + loss, cont_gradient = self.get_loss(cont_encodings, desc_encodings, targets) - loss, gradient = self.get_loss(cont_encodings, desc_encodings, targets) + # loss, desc_gradient = self.get_loss(desc_encodings, cont_encodings, targets) + # cont_gradient = cont_gradient / 2 + # desc_gradient = desc_gradient / 2 + # bp_desc(desc_gradient, sgd=self.sgd_desc) - # print("gradient", gradient) if self.PRINT_BATCH_LOSS: print("batch loss", loss) - context_gradient = bp_cont(gradient, sgd=self.sgd_cont) + context_gradient = bp_cont(cont_gradient, sgd=self.sgd_cont) # gradient : concat (doc+sent) vs. 
desc sent_start = self.ARTICLE_WIDTH @@ -393,9 +382,6 @@ class EL_Model: doc_gradients.append(list(x[0:sent_start])) sent_gradients.append(list(x[sent_start:])) - # print("doc_gradients", doc_gradients) - # print("sent_gradients", sent_gradients) - bp_doc(doc_gradients, sgd=self.sgd_article) bp_sent(sent_gradients, sgd=self.sgd_sent) @@ -426,74 +412,75 @@ class EL_Model: article_id = f.replace(".txt", "") if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") - cnt += 1 - # parse the article text - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - article_doc = self.nlp(text) - truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] - text_by_article[article_id] = truncated_text + try: + # parse the article text + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = self.nlp(text) + truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] + text_by_article[article_id] = truncated_text - # process all positive and negative entities, collect all relevant mentions in this article - for mention, entity_pos in correct_entries[article_id].items(): - cluster = article_id + "_" + mention - descr = id_to_descr.get(entity_pos) - entities = set() - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 1 - desc_by_entity[entity] = descr - entities.add(entity) + # process all positive and negative entities, collect all relevant mentions in this article + for mention, entity_pos in correct_entries[article_id].items(): + cluster = article_id + "_" + mention + descr = id_to_descr.get(entity_pos) + entities = set() + if descr: + entity = "E_" + str(next_entity_nr) + "_" + cluster + next_entity_nr += 1 + gold_by_entity[entity] = 1 + desc_by_entity[entity] = descr + entities.add(entity) - entity_negs = incorrect_entries[article_id][mention] - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 0 - desc_by_entity[entity] = descr - entities.add(entity) + entity_negs = incorrect_entries[article_id][mention] + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + entity = "E_" + str(next_entity_nr) + "_" + cluster + next_entity_nr += 1 + gold_by_entity[entity] = 0 + desc_by_entity[entity] = descr + entities.add(entity) - found_matches = 0 - if len(entities) > 1: - entities_by_cluster[cluster] = entities + found_matches = 0 + if len(entities) > 1: + entities_by_cluster[cluster] = entities - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(self.nlp.vocab) - patterns = list(self.nlp.tokenizer.pipe([mention])) + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(self.nlp.vocab) + patterns = list(self.nlp.tokenizer.pipe([mention])) - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + # store sentences + for match_id, start, end in matches: + span = article_doc[start:end] + if mention == span.text: + found_matches += 1 + sent_text = span.sent.text + sent_nr = sentence_by_text.get(sent_text, None) + if sent_nr is None: + sent_nr = "S_" + str(next_sent_nr) 
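A minimal, standalone sketch of the PhraseMatcher lookup that _get_training_data uses above to tie a mention string to the sentence it occurs in. The text, the mention and the "MENTION" key are invented for the example, and a plain sentencizer stands in for the full en_core_web_md pipeline; the matcher.add() call follows the v2-style signature used in this patch.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))   # needed for span.sent below

text = "Douglas Adams wrote the book. The book made Douglas Adams famous."
mention = "Douglas Adams"

doc = nlp(text)
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.tokenizer.pipe([mention]))
matcher.add("MENTION", None, *patterns)

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print(span.text, "->", span.sent.text)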
+ article_id + next_sent_nr += 1 + text_by_sentence[sent_nr] = sent_text + sentence_by_text[sent_text] = sent_nr + article_by_cluster[cluster] = article_id + sentence_by_cluster[cluster] = sent_nr - # store sentences - for match_id, start, end in matches: - found_matches += 1 - span = article_doc[start:end] - assert mention == span.text - sent_text = span.sent.text - sent_nr = sentence_by_text.get(sent_text, None) - if sent_nr is None: - sent_nr = "S_" + str(next_sent_nr) + article_id - next_sent_nr += 1 - text_by_sentence[sent_nr] = sent_text - sentence_by_text[sent_text] = sent_nr - article_by_cluster[cluster] = article_id - sentence_by_cluster[cluster] = sent_nr - - if found_matches == 0: - # TODO print("Could not find neg instances or sentence matches for", mention, "in", article_id) - entities_by_cluster.pop(cluster, None) - article_by_cluster.pop(cluster, None) - sentence_by_cluster.pop(cluster, None) - for entity in entities: - gold_by_entity.pop(entity, None) - desc_by_entity.pop(entity, None) - + if found_matches == 0: + # print("Could not find neg instances or sentence matches for", mention, "in", article_id) + entities_by_cluster.pop(cluster, None) + article_by_cluster.pop(cluster, None) + sentence_by_cluster.pop(cluster, None) + for entity in entities: + gold_by_entity.pop(entity, None) + desc_by_entity.pop(entity, None) + cnt += 1 + except: + print("Problem parsing article", article_id) if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a24ff30c5..2ebf9973e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) print() # STEP 7: apply the EL algorithm on the dev dataset @@ -120,7 +120,6 @@ if __name__ == "__main__": run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) print() - # TODO coreference resolution # add_coref() From fb37cdb2d30a6ac3a66df9cddb39951c4bcc93e8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Jun 2019 21:32:54 +0200 Subject: [PATCH 062/148] implementing el pipe in pipes.pyx (not tested yet) --- .../pipeline/wiki_entity_linking/train_el.py | 21 +-- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- spacy/_ml.py | 22 +++ spacy/pipeline/pipes.pyx | 156 ++++++++++++++---- 4 files changed, 160 insertions(+), 41 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index a2db2dc95..b9a0dc843 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -24,8 +24,6 @@ from thinc.misc import LayerNorm as LN # from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher -""" TODO: this code needs to be implemented in pipes.pyx""" - class EL_Model: @@ -45,7 +43,7 @@ class EL_Model: DROP = 0.1 LEARN_RATE = 0.001 - EPOCHS = 20 + EPOCHS = 5 L2 = 1e-6 name = "entity_linker" @@ -213,8 +211,7 @@ class EL_Model: if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and 
self.desc_encoder.use_params(self.sgd_desc.averages)\ - and self.sent_encoder.use_params(self.sgd_sent.averages)\ - and self.cont_encoder.use_params(self.sgd_cont.averages): + and self.sent_encoder.use_params(self.sgd_sent.averages): desc_encodings = self.desc_encoder(desc_docs) doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) @@ -226,7 +223,13 @@ class EL_Model: concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + if avg: + with self.cont_encoder.use_params(self.sgd_cont.averages): + cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + + else: + cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + context_enc = np.transpose(cont_encodings) highest_sim = -5 @@ -298,8 +301,8 @@ class EL_Model: self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - def get_loss(self, v1, v2, targets): - loss, gradients = self.get_cossim_loss(v1, v2, targets) + def get_loss(self, pred, gold, targets): + loss, gradients = self.get_cossim_loss(pred, gold, targets) return loss, gradients def get_cossim_loss(self, yh, y, t): @@ -327,8 +330,6 @@ class EL_Model: return loss, -inverse def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): - all_clusters = list(entity_clusters.keys()) - arts_list = list() sents_list = list() descs_list = list() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 2ebf9973e..40d737a6f 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset diff --git a/spacy/_ml.py b/spacy/_ml.py index 349b88df9..29772c5ee 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,6 +652,28 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model +def build_nel_encoder(in_width, hidden_width, end_width, **cfg): + conv_depth = cfg.get("conv_depth", 2) + cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) + + with Model.define_operators({">>": chain, "**": clone}): + convolution = Residual((ExtractWindow(nW=1) >> + LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + + encoder = SpacyVectors \ + >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> flatten_add_lengths \ + >> ParametricAttention(hidden_width) \ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + + # TODO: ReLu or LN(Maxout) ? + # sum_pool or mean_pool ? 
+ + encoder.nO = end_width + return encoder + @layerize def flatten(seqs, drop=0.0): ops = Model.ops diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 69521c1b2..c8afd431e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,6 +13,8 @@ from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm from thinc.neural.util import to_categorical, copy_array +from spacy.cli.pretrain import get_cossim_loss + from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -23,14 +25,17 @@ from ..vocab cimport Vocab from ..syntax import nonproj from ..attrs import POS, ID from ..parts_of_speech import X -from .._ml import Tok2Vec, build_tagger_model +from .._ml import Tok2Vec, build_tagger_model, cosine from .._ml import build_text_classifier, build_simple_cnn_text_classifier -from .._ml import build_bow_text_classifier +from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import masked_language_model, create_default_optimizer from ..errors import Errors, TempErrors from .. import util +# TODO: remove +from examples.pipeline.wiki_entity_linking import kb_creator + def _load_cfg(path): if path.exists(): @@ -1065,50 +1070,141 @@ class EntityLinker(Pipe): name = 'entity_linker' @classmethod - def Model(cls, nr_class=1, **cfg): - # TODO: non-dummy EL implementation - return None + def Model(cls, **cfg): + embed_width = cfg.get("embed_width", 300) + hidden_width = cfg.get("hidden_width", 32) + entity_width = cfg.get("entity_width", 64) + article_width = cfg.get("article_width", 128) + sent_width = cfg.get("sent_width", 64) + + entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width) + + article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) + sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) + + # dimension of the mention encoder needs to match the dimension of the entity encoder + mention_width = entity_encoder.nO + mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) + + return entity_encoder, article_encoder, sent_encoder, mention_encoder + + def __init__(self, **cfg): + # TODO: bring-your-own-model + self.mention_encoder = True - def __init__(self, model=True, **cfg): - self.model = False self.cfg = dict(cfg) self.kb = self.cfg["kb"] + # TODO: fix this. store entity vectors in the KB ? 
+ self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv') + + def use_avg_params(self): + """Modify the pipe's encoders/models, to use their average parameter values.""" + with self.article_encoder.use_params(self.sgd_article.averages) \ + and self.entity_encoder.use_params(self.sgd_entity.averages)\ + and self.sent_encoder.use_params(self.sgd_sent.averages) \ + and self.mention_encoder.use_params(self.sgd_mention.averages): + yield + + def require_model(self): + """Raise an error if the component's model is not initialized.""" + if getattr(self, "mention_encoder", None) in (None, True, False): + raise ValueError(Errors.E109.format(name=self.name)) + + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + if self.mention_encoder is True: + self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) + + def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): + """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """ + self.require_model() + + entity_docs, article_docs, sentence_docs = docs + assert len(entity_docs) == len(article_docs) == len(sentence_docs) + + if isinstance(entity_docs, Doc): + entity_docs = [entity_docs] + article_docs = [article_docs] + sentence_docs = [sentence_docs] + + entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop) + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + range(len(article_docs))] + mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) + + loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + + mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) + + # gradient : concat (doc+sent) vs. desc + sent_start = self.article_encoder.nO + sent_gradients = list() + doc_gradients = list() + for x in mention_gradient: + doc_gradients.append(list(x[0:sent_start])) + sent_gradients.append(list(x[sent_start:])) + + bp_doc(doc_gradients, sgd=self.sgd_article) + bp_sent(sent_gradients, sgd=self.sgd_sent) + + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += loss + return loss + + def get_loss(self, docs, golds, scores): + loss, gradients = get_cossim_loss(scores, golds) + return loss, gradients + def __call__(self, doc): - self.set_annotations([doc], scores=None, tensors=None) + entities, kb_ids = self.predict([doc]) + self.set_annotations([doc], entities, kb_ids) return doc def pipe(self, stream, batch_size=128, n_threads=-1): - """Apply the pipe to a stream of documents. - Both __call__ and pipe should delegate to the `predict()` - and `set_annotations()` methods. 
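The gradient bookkeeping in the update() method above mirrors train_el.py: the mention encoder consumes the concatenation of the article and sentence encodings, so its input gradient has to be sliced back into those two blocks before it can be routed to bp_doc and bp_sent. A tiny numpy illustration, with the widths taken from the cfg defaults above and a random array standing in for mention_gradient:

import numpy as np

ARTICLE_WIDTH, SENT_WIDTH = 128, 64
mention_gradient = np.random.rand(4, ARTICLE_WIDTH + SENT_WIDTH)    # batch of 4

doc_gradients = mention_gradient[:, :ARTICLE_WIDTH]     # routed back to the article encoder
sent_gradients = mention_gradient[:, ARTICLE_WIDTH:]    # routed back to the sentence encoder
print(doc_gradients.shape, sent_gradients.shape)        # (4, 128) (4, 64)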
- """ for docs in util.minibatch(stream, size=batch_size): docs = list(docs) - self.set_annotations(docs, scores=None, tensors=None) + entities, kb_ids = self.predict(docs) + self.set_annotations(docs, entities, kb_ids) yield from docs - def set_annotations(self, docs, scores, tensors=None): - """ - Currently implemented as taking the KB entry with highest prior probability for each named entity - TODO: actually use context etc - """ - for i, doc in enumerate(docs): - for ent in doc.ents: + def predict(self, docs): + self.require_model() + for i, article_doc in enumerate(docs): + doc_encoding = self.article_encoder([article_doc]) + for ent in article_doc.ents: + sent_doc = ent.sent.as_doc() + sent_encoding = self.sent_encoder([sent_doc]) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + mention_enc_t = np.transpose(mention_encoding) + candidates = self.kb.get_candidates(ent.text) if candidates: - best_candidate = max(candidates, key=lambda c: c.prior_prob) - for token in ent: - token.ent_kb_id_ = best_candidate.entity_ + highest_sim = -5 + best_i = -1 + with self.use_avg_params: + for c in candidates: + kb_id = c.entity_ + description = self.id_to_descr.get(kb_id) + entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ? + sim = cosine(entity_encodings, mention_enc_t) + if sim >= highest_sim: + best_i = i + highest_sim = sim - def get_loss(self, docs, golds, scores): - # TODO - pass - - def add_label(self, label): - # TODO - pass + # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob) + def set_annotations(self, docs, entities, kb_ids=None): + for token, kb_id in zip(entities, kb_ids): + token.ent_kb_id_ = kb_id class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. 
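Stripped of the thinc layers, the predict() logic introduced in this patch comes down to: encode the article and the local sentence, concatenate the two, project the concatenation into the entity-embedding space, and keep the candidate whose description encoding is most similar. A rough numpy sketch of that selection step, with random vectors standing in for the trained encoder outputs and an arbitrary matrix W standing in for the Affine mention encoder:

import numpy as np

rng = np.random.RandomState(0)
ARTICLE_WIDTH, SENT_WIDTH, ENTITY_WIDTH = 128, 64, 64

article_enc = rng.rand(ARTICLE_WIDTH)            # stand-in for article_encoder output
sentence_enc = rng.rand(SENT_WIDTH)              # stand-in for sent_encoder output
candidate_encs = rng.rand(3, ENTITY_WIDTH)       # one row per KB candidate description

W = rng.rand(ENTITY_WIDTH, ARTICLE_WIDTH + SENT_WIDTH)
mention_enc = W @ np.concatenate([article_enc, sentence_enc])

def cosine(a, b):
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

sims = [cosine(mention_enc, cand) for cand in candidate_encs]
best = int(np.argmax(sims))
print("best candidate index:", best, "similarity:", sims[best])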
From 9abbd0899fe2fb64601f02bca206dcad1431365c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 5 Jun 2019 00:09:46 +0200 Subject: [PATCH 063/148] separate entity encoder to get 64D descriptions --- .../wiki_entity_linking/train_descriptions.py | 113 ++++++++++++++++++ .../pipeline/wiki_entity_linking/train_el.py | 18 +-- .../wiki_entity_linking/wiki_nel_pipeline.py | 20 +++- spacy/pipeline/pipes.pyx | 22 ++-- 4 files changed, 152 insertions(+), 21 deletions(-) create mode 100644 examples/pipeline/wiki_entity_linking/train_descriptions.py diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py new file mode 100644 index 000000000..63149b5f7 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -0,0 +1,113 @@ +from random import shuffle + +from examples.pipeline.wiki_entity_linking import kb_creator + +import numpy as np + +from spacy._ml import zero_init, create_default_optimizer +from spacy.cli.pretrain import get_cossim_loss + +from thinc.v2v import Model +from thinc.api import chain +from thinc.neural._classes.affine import Affine + + +class EntityEncoder: + + INPUT_DIM = 300 # dimension of pre-trained vectors + DESC_WIDTH = 64 + + DROP = 0 + EPOCHS = 5 + STOP_THRESHOLD = 0.05 + + BATCH_SIZE = 1000 + + def __init__(self, kb, nlp): + self.nlp = nlp + self.kb = kb + + def run(self, entity_descr_output): + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + + processed, loss = self._train_model(entity_descr_output, id_to_descr) + print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print("Final loss:", loss) + print() + + # TODO: apply and write to file afterwards ! + # self._apply_encoder(id_to_descr) + + def _train_model(self, entity_descr_output, id_to_descr): + # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy + + self._build_network(self.INPUT_DIM, self.DESC_WIDTH) + + processed = 0 + loss = 1 + + for i in range(self.EPOCHS): + entity_keys = list(id_to_descr.keys()) + shuffle(entity_keys) + + batch_nr = 0 + start = 0 + stop = min(self.BATCH_SIZE, len(entity_keys)) + + while loss > self.STOP_THRESHOLD and start < len(entity_keys): + batch = [] + for e in entity_keys[start:stop]: + descr = id_to_descr[e] + doc = self.nlp(descr) + doc_vector = self._get_doc_embedding(doc) + batch.append(doc_vector) + + loss = self.update(batch) + print(i, batch_nr, loss) + processed += len(batch) + + batch_nr += 1 + start = start + self.BATCH_SIZE + stop = min(stop + self.BATCH_SIZE, len(entity_keys)) + + return processed, loss + + def _apply_encoder(self, id_to_descr): + for id, descr in id_to_descr.items(): + doc = self.nlp(descr) + doc_vector = self._get_doc_embedding(doc) + encoding = self.encoder(np.asarray([doc_vector])) + + @staticmethod + def _get_doc_embedding(doc): + indices = np.zeros((len(doc),), dtype="i") + for i, word in enumerate(doc): + if word.orth in doc.vocab.vectors.key2row: + indices[i] = doc.vocab.vectors.key2row[word.orth] + else: + indices[i] = 0 + word_vectors = doc.vocab.vectors.data[indices] + doc_vector = np.mean(word_vectors, axis=0) # TODO: min? max? 
+ return doc_vector + + def _build_network(self, orig_width, hidden_with): + with Model.define_operators({">>": chain}): + self.encoder = ( + Affine(hidden_with, orig_width) + ) + self.model = self.encoder >> zero_init(Affine(orig_width, hidden_with, drop_factor=0.0)) + + self.sgd = create_default_optimizer(self.model.ops) + + def update(self, vectors): + predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) + + loss, d_scores = self.get_loss(scores=predictions, golds=np.asarray(vectors)) + bp_model(d_scores, sgd=self.sgd) + + return loss / len(vectors) + + @staticmethod + def get_loss(golds, scores): + loss, gradients = get_cossim_loss(scores, golds) + return loss, gradients diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index b9a0dc843..143e38d99 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -31,7 +31,7 @@ class EL_Model: PRINT_BATCH_LOSS = False EPS = 0.0000000005 - BATCH_SIZE = 5 + BATCH_SIZE = 100 DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors @@ -41,9 +41,9 @@ class EL_Model: ARTICLE_WIDTH = 128 SENT_WIDTH = 64 - DROP = 0.1 - LEARN_RATE = 0.001 - EPOCHS = 5 + DROP = 0.4 + LEARN_RATE = 0.005 + EPOCHS = 10 L2 = 1e-6 name = "entity_linker" @@ -62,12 +62,14 @@ class EL_Model: def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ - self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + self._get_training_data(training_dir, id_to_descr, False, trainlimit, to_print=False) train_clusters = list(train_ent.keys()) dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ - self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) + self._get_training_data(training_dir, id_to_descr, True, devlimit, to_print=False) dev_clusters = list(dev_ent.keys()) dev_pos_count = len([g for g in dev_gold.values() if g]) @@ -386,9 +388,7 @@ class EL_Model: bp_doc(doc_gradients, sgd=self.sgd_article) bp_sent(sent_gradients, sgd=self.sgd_sent) - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) - + def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print): correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 40d737a6f..1f4b4b67e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy @@ -38,11 +39,14 @@ if __name__ == 
"__main__": to_read_kb = True to_test_kb = False + # run entity description pre-training + run_desc_training = True + # create training dataset create_wp_training = False - # run training - run_training = True + # run EL training + run_el_training = False # apply named entity linking to the dev dataset apply_to_dev = False @@ -101,17 +105,25 @@ if __name__ == "__main__": run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp) print() + # STEP 4b : read KB back in from file, create entity descriptions + # TODO: write back to file + if run_desc_training: + print("STEP 4b: training entity descriptions", datetime.datetime.now()) + my_nlp = spacy.load('en_core_web_md') + EntityEncoder(my_kb, my_nlp).run(entity_descr_output=ENTITY_DESCR) + print() + # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: apply the EL algorithm on the training dataset - if run_training: + if run_el_training: print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) print() # STEP 7: apply the EL algorithm on the dev dataset diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c8afd431e..d0c83b56e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1177,6 +1177,8 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + final_entities = list() + final_kb_ids = list() for i, article_doc in enumerate(docs): doc_encoding = self.article_encoder([article_doc]) for ent in article_doc.ents: @@ -1188,23 +1190,27 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if candidates: - highest_sim = -5 - best_i = -1 with self.use_avg_params: + scores = list() for c in candidates: + prior_prob = c.prior_prob kb_id = c.entity_ description = self.id_to_descr.get(kb_id) entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ? sim = cosine(entity_encodings, mention_enc_t) - if sim >= highest_sim: - best_i = i - highest_sim = sim + score = prior_prob + sim - (prior_prob*sim) # TODO: weights ? + scores.append(score) - # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob) + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate) + + return final_entities, final_kb_ids def set_annotations(self, docs, entities, kb_ids=None): - for token, kb_id in zip(entities, kb_ids): - token.ent_kb_id_ = kb_id + for entity, kb_id in zip(entities, kb_ids): + entity.ent_kb_id_ = kb_id class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. 
From 5c723c32c3e5e639f99005130c050afcf8230346 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 5 Jun 2019 18:29:18 +0200 Subject: [PATCH 064/148] entity vectors in the KB + serialization of them --- examples/pipeline/dummy_entity_linking.py | 8 +- .../wiki_entity_linking/kb_creator.py | 5 +- .../wiki_entity_linking/train_descriptions.py | 41 ++++- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- spacy/kb.pxd | 31 ++-- spacy/kb.pyx | 143 +++++++++++++----- spacy/pipeline/pipes.pyx | 2 +- spacy/structs.pxd | 14 +- spacy/tests/pipeline/test_entity_linker.py | 56 ++++--- spacy/tests/serialize/test_serialize_kb.py | 15 +- 10 files changed, 223 insertions(+), 94 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index ae36a57b3..3f1fabdfd 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -9,20 +9,20 @@ from spacy.kb import KnowledgeBase def create_kb(vocab): - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(vocab=vocab, entity_vector_length=1) # adding entities entity_0 = "Q1004791_Douglas" print("adding entity", entity_0) - kb.add_entity(entity=entity_0, prob=0.5) + kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0]) entity_1 = "Q42_Douglas_Adams" print("adding entity", entity_1) - kb.add_entity(entity=entity_1, prob=0.5) + kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1]) entity_2 = "Q5301561_Douglas_Haig" print("adding entity", entity_2) - kb.add_entity(entity=entity_2, prob=0.5) + kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2]) # adding aliases print() diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index bb00f918d..ae3422c91 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -16,7 +16,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, count_input, prior_prob_input, to_print=False, write_entity_defs=True): """ Create the knowledge base from Wikidata entries """ - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) # TODO: entity vectors ! print() print("1. _read_wikidata_entities", datetime.datetime.now()) @@ -38,7 +38,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, print() print("3. adding", len(entity_list), "entities", datetime.datetime.now()) print() - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) + # TODO: vector_list ! + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None) print() print("4. adding aliases", datetime.datetime.now()) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 63149b5f7..88b1bf819 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -19,7 +19,7 @@ class EntityEncoder: DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.05 + STOP_THRESHOLD = 0.1 BATCH_SIZE = 1000 @@ -38,6 +38,8 @@ class EntityEncoder: # TODO: apply and write to file afterwards ! 
# self._apply_encoder(id_to_descr) + self._test_encoder() + def _train_model(self, entity_descr_output, id_to_descr): # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy @@ -111,3 +113,40 @@ class EntityEncoder: def get_loss(golds, scores): loss, gradients = get_cossim_loss(scores, golds) return loss, gradients + + def _test_encoder(self): + """ Test encoder on some dummy examples """ + desc_A1 = "Fictional character in The Simpsons" + desc_A2 = "Simpsons - fictional human" + desc_A3 = "Fictional character in The Flintstones" + desc_A4 = "Politician from the US" + + A1_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A1))]) + A2_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A2))]) + A3_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A3))]) + A4_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A4))]) + + loss_a1_a1, _ = get_cossim_loss(A1_doc_vector, A1_doc_vector) + loss_a1_a2, _ = get_cossim_loss(A1_doc_vector, A2_doc_vector) + loss_a1_a3, _ = get_cossim_loss(A1_doc_vector, A3_doc_vector) + loss_a1_a4, _ = get_cossim_loss(A1_doc_vector, A4_doc_vector) + + print("sim doc A1 A1", loss_a1_a1) + print("sim doc A1 A2", loss_a1_a2) + print("sim doc A1 A3", loss_a1_a3) + print("sim doc A1 A4", loss_a1_a4) + + A1_encoded = self.encoder(A1_doc_vector) + A2_encoded = self.encoder(A2_doc_vector) + A3_encoded = self.encoder(A3_doc_vector) + A4_encoded = self.encoder(A4_doc_vector) + + loss_a1_a1, _ = get_cossim_loss(A1_encoded, A1_encoded) + loss_a1_a2, _ = get_cossim_loss(A1_encoded, A2_encoded) + loss_a1_a3, _ = get_cossim_loss(A1_encoded, A3_encoded) + loss_a1_a4, _ = get_cossim_loss(A1_encoded, A4_encoded) + + print("sim encoded A1 A1", loss_a1_a1) + print("sim encoded A1 A2", loss_a1_a2) + print("sim encoded A1 A3", loss_a1_a3) + print("sim encoded A1 A4", loss_a1_a4) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 1f4b4b67e..d813238b7 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -93,7 +93,7 @@ if __name__ == "__main__": print("STEP 4: to_read_kb", datetime.datetime.now()) my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab) + my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) # TODO entity vectors my_kb.load_bulk(KB_FILE) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 494848e5e..9c5a73d59 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -12,6 +12,8 @@ from .typedefs cimport hash_t from .structs cimport EntryC, AliasC ctypedef vector[EntryC] entry_vec ctypedef vector[AliasC] alias_vec +ctypedef vector[float] float_vec +ctypedef vector[float_vec] float_matrix # Object used by the Entity Linker that summarizes one entity-alias candidate combination. @@ -20,6 +22,7 @@ cdef class Candidate: cdef readonly KnowledgeBase kb cdef hash_t entity_hash cdef float entity_freq + cdef vector[float] entity_vector cdef hash_t alias_hash cdef float prior_prob @@ -27,6 +30,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem cpdef readonly Vocab vocab + cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _EntryC struct in the _entries vector). 
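To keep the indirection in this header straight: the PreshMap maps an entity hash to a position in the _entries vector, and each entry in turn stores a row number into the separate _vectors_table. Below is a pure-Python mock of the same bookkeeping, with the class name PyKBMock and the toy values invented for illustration; the real structures are C vectors and PreshMaps for speed and memory.

class PyKBMock(object):
    """Pure-Python stand-in for the hash -> entry -> vector indirection."""

    def __init__(self, entity_vector_length):
        self.entity_vector_length = entity_vector_length
        self.entry_index = {}     # entity hash -> position in self.entries
        self.entries = []         # (entity_hash, prob, vector_index) records
        self.vectors_table = []   # one fixed-length vector per entity

    def add_entity(self, entity_hash, prob, entity_vector):
        assert len(entity_vector) == self.entity_vector_length
        vector_index = len(self.vectors_table)
        self.vectors_table.append(list(entity_vector))
        self.entry_index[entity_hash] = len(self.entries)
        self.entries.append((entity_hash, prob, vector_index))

    def get_vector(self, entity_hash):
        _, _, vector_index = self.entries[self.entry_index[entity_hash]]
        return self.vectors_table[vector_index]

kb_mock = PyKBMock(entity_vector_length=3)
kb_mock.add_entity(entity_hash=12345, prob=0.5, entity_vector=[0.1, 0.2, 0.3])
print(kb_mock.get_vector(12345))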
@@ -59,7 +63,7 @@ cdef class KnowledgeBase: # model, that embeds different features of the entities into vectors. We'll # still want some per-entity features, like the Wikipedia text or entity # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions. - cdef object _vectors_table + cdef float_matrix _vectors_table # It's very useful to track categorical features, at least for output, even # if they're not useful in the model itself. For instance, we should be @@ -69,8 +73,15 @@ cdef class KnowledgeBase: cdef object _features_table + cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil: + """Add an entity vector to the vectors table.""" + cdef int64_t new_index = self._vectors_table.size() + self._vectors_table.push_back(entity_vector) + return new_index + + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, - int32_t* vector_rows, int feats_row) nogil: + int32_t vector_index, int feats_row) nogil: """Add an entry to the vector of entries. After calling this method, make sure to update also the _entry_index using the return value""" # This is what we'll map the entity hash key to. It's where the entry will sit @@ -80,7 +91,7 @@ cdef class KnowledgeBase: # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 cdef EntryC entry entry.entity_hash = entity_hash - entry.vector_rows = vector_rows + entry.vector_index = vector_index entry.feats_row = feats_row entry.prob = prob @@ -113,7 +124,7 @@ cdef class KnowledgeBase: # Avoid struct initializer to enable nogil cdef EntryC entry entry.entity_hash = dummy_hash - entry.vector_rows = &dummy_value + entry.vector_index = dummy_value entry.feats_row = dummy_value entry.prob = dummy_value @@ -131,15 +142,16 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) cpdef load_bulk(self, loc) - cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list) + cpdef set_entities(self, entity_list, prob_list, vector_list) cpdef set_aliases(self, alias_list, entities_list, probabilities_list) cdef class Writer: cdef FILE* _fp - cdef int write_header(self, int64_t nr_entries) except -1 - cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1 + cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 + cdef int write_vector_element(self, float element) except -1 + cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1 cdef int write_alias_length(self, int64_t alias_length) except -1 cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 @@ -150,8 +162,9 @@ cdef class Writer: cdef class Reader: cdef FILE* _fp - cdef int read_header(self, int64_t* nr_entries) except -1 - cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1 + cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 + cdef int read_vector_element(self, float* element) except -1 + cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1 cdef int read_alias_length(self, int64_t* alias_length) except -1 cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index d471130d0..790bb4992 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -26,10 +26,11 @@ from libcpp.vector cimport vector cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, alias_hash, prior_prob): + def 
__init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): self.kb = kb self.entity_hash = entity_hash self.entity_freq = entity_freq + self.entity_vector = entity_vector self.alias_hash = alias_hash self.prior_prob = prior_prob @@ -57,19 +58,26 @@ cdef class Candidate: def entity_freq(self): return self.entity_freq + @property + def entity_vector(self): + return self.entity_vector + @property def prior_prob(self): return self.prior_prob cdef class KnowledgeBase: - def __init__(self, Vocab vocab): + + def __init__(self, Vocab vocab, entity_vector_length): self.vocab = vocab self.mem = Pool() + self.entity_vector_length = entity_vector_length + self._entry_index = PreshMap() self._alias_index = PreshMap() - # TODO initialize self._entries and self._aliases_table ? + # Should we initialize self._entries and self._aliases_table to specific starting size ? self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) @@ -89,10 +97,10 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): + def add_entity(self, unicode entity, float prob, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency - Return the hash of the entity ID/name at the end + Return the hash of the entity ID/name at the end. """ cdef hash_t entity_hash = self.vocab.strings.add(entity) @@ -101,31 +109,41 @@ cdef class KnowledgeBase: user_warning(Warnings.W018.format(entity=entity)) return - cdef int32_t dummy_value = 342 - new_index = self.c_add_entity(entity_hash=entity_hash, prob=prob, - vector_rows=&dummy_value, feats_row=dummy_value) - self._entry_index[entity_hash] = new_index + if len(entity_vector) != self.entity_vector_length: + # TODO: proper error + raise ValueError("Entity vector length should have been", self.entity_vector_length) - # TODO self._vectors_table.get_pointer(vectors), - # self._features_table.get(features)) + vector_index = self.c_add_vector(entity_vector=entity_vector) + + new_index = self.c_add_entity(entity_hash=entity_hash, + prob=prob, + vector_index=vector_index, + feats_row=-1) # Features table currently not implemented + self._entry_index[entity_hash] = new_index return entity_hash - cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list): + cpdef set_entities(self, entity_list, prob_list, vector_list): nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) i = 0 cdef EntryC entry - cdef int32_t dummy_value = 342 while i < nr_entities: - # TODO features and vectors - entity_hash = self.vocab.strings.add(entity_list[i]) + entity_vector = entity_list[i] + if len(entity_vector) != self.entity_vector_length: + # TODO: proper error + raise ValueError("Entity vector length should have been", self.entity_vector_length) + + entity_hash = self.vocab.strings.add(entity_vector) entry.entity_hash = entity_hash entry.prob = prob_list[i] - entry.vector_rows = &dummy_value - entry.feats_row = dummy_value + + vector_index = self.c_add_vector(entity_vector=vector_list[i]) + entry.vector_index = vector_index + + entry.feats_row = -1 # Features table currently not implemented self._entries[i+1] = entry self._entry_index[entity_hash] = i+1 @@ -186,7 +204,7 @@ cdef class KnowledgeBase: cdef hash_t alias_hash = self.vocab.strings.add(alias) - # Return if 
this alias was added before + # Check whether this alias was added before if alias_hash in self._alias_index: user_warning(Warnings.W017.format(alias=alias)) return @@ -208,9 +226,7 @@ cdef class KnowledgeBase: return alias_hash - def get_candidates(self, unicode alias): - """ TODO: where to put this functionality ?""" cdef hash_t alias_hash = self.vocab.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] @@ -218,6 +234,7 @@ cdef class KnowledgeBase: return [Candidate(kb=self, entity_hash=self._entries[entry_index].entity_hash, entity_freq=self._entries[entry_index].prob, + entity_vector=self._vectors_table[self._entries[entry_index].vector_index], alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) @@ -226,16 +243,23 @@ cdef class KnowledgeBase: def dump(self, loc): cdef Writer writer = Writer(loc) - writer.write_header(self.get_size_entities()) + writer.write_header(self.get_size_entities(), self.entity_vector_length) + + # dumping the entity vectors in their original order + i = 0 + for entity_vector in self._vectors_table: + for element in entity_vector: + writer.write_vector_element(element) + i = i+1 # dumping the entry records in the order in which they are in the _entries vector. # index 0 is a dummy object not stored in the _entry_index and can be ignored. i = 1 for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): entry = self._entries[entry_index] - assert entry.entity_hash == entry_hash + assert entry.entity_hash == entry_hash assert entry_index == i - writer.write_entry(entry.entity_hash, entry.prob) + writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index) i = i+1 writer.write_alias_length(self.get_size_aliases()) @@ -262,31 +286,47 @@ cdef class KnowledgeBase: cdef hash_t alias_hash cdef int64_t entry_index cdef float prob + cdef int32_t vector_index cdef EntryC entry cdef AliasC alias - cdef int32_t dummy_value = 342 + cdef float vector_element cdef Reader reader = Reader(loc) - # Step 1: load entities - + # STEP 0: load header and initialize KB cdef int64_t nr_entities - reader.read_header(&nr_entities) + cdef int64_t entity_vector_length + reader.read_header(&nr_entities, &entity_vector_length) + + self.entity_vector_length = entity_vector_length self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) + self._vectors_table = float_matrix(nr_entities+1) + # STEP 1: load entity vectors + cdef int i = 0 + cdef int j = 0 + while i < nr_entities: + entity_vector = float_vec(entity_vector_length) + j = 0 + while j < entity_vector_length: + reader.read_vector_element(&vector_element) + entity_vector[j] = vector_element + j = j+1 + self._vectors_table[i] = entity_vector + i = i+1 + + # STEP 2: load entities # we assume that the entity data was written in sequence # index 0 is a dummy object not stored in the _entry_index and can be ignored. - # TODO: should we initialize the dummy objects ? 
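As a rough guide to the byte layout that dump() and load_bulk() above agree on, the standalone snippet below writes and reads the same sequence of fields with Python's struct module: a header of two int64 values (entity count and entity vector length), the flattened float32 entity vectors, and one (uint64 hash, float32 prob, int32 vector index) record per entry. The alias section that follows in the real format is omitted, the file name and toy values are arbitrary, and the Cython Writer/Reader emit the fields with fwrite/fread rather than struct, so this is only a conceptual mirror.

import struct

entities = [(11111, 0.5, 0), (22222, 0.3, 1)]   # (entity_hash, prob, vector_index)
vectors = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
vector_length = 3

with open("kb_dump_sketch.bin", "wb") as f:
    f.write(struct.pack("<qq", len(entities), vector_length))         # header
    for vec in vectors:                                               # entity vectors
        for element in vec:
            f.write(struct.pack("<f", element))
    for entity_hash, prob, vector_index in entities:                  # entry records
        f.write(struct.pack("<Qfi", entity_hash, prob, vector_index))

with open("kb_dump_sketch.bin", "rb") as f:
    nr_entities, vec_len = struct.unpack("<qq", f.read(16))
    read_vectors = [struct.unpack("<" + "f" * vec_len, f.read(4 * vec_len))
                    for _ in range(nr_entities)]
    read_entries = [struct.unpack("<Qfi", f.read(16)) for _ in range(nr_entities)]

print(nr_entities, vec_len, read_entries)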
- cdef int i = 1 + i = 1 while i <= nr_entities: - reader.read_entry(&entity_hash, &prob) + reader.read_entry(&entity_hash, &prob, &vector_index) - # TODO features and vectors entry.entity_hash = entity_hash entry.prob = prob - entry.vector_rows = &dummy_value - entry.feats_row = dummy_value + entry.vector_index = vector_index + entry.feats_row = -1 # Features table currently not implemented self._entries[i] = entry self._entry_index[entity_hash] = i @@ -296,7 +336,8 @@ cdef class KnowledgeBase: # check that all entities were read in properly assert nr_entities == self.get_size_entities() - # Step 2: load aliases + # STEP 3: load aliases + cdef int64_t nr_aliases reader.read_alias_length(&nr_aliases) self._alias_index = PreshMap(nr_aliases+1) @@ -344,13 +385,18 @@ cdef class Writer: cdef size_t status = fclose(self._fp) assert status == 0 - cdef int write_header(self, int64_t nr_entries) except -1: + cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1: self._write(&nr_entries, sizeof(nr_entries)) + self._write(&entity_vector_length, sizeof(entity_vector_length)) - cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1: - # TODO: feats_rows and vector rows + cdef int write_vector_element(self, float element) except -1: + self._write(&element, sizeof(element)) + + cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1: self._write(&entry_hash, sizeof(entry_hash)) self._write(&entry_prob, sizeof(entry_prob)) + self._write(&vector_index, sizeof(vector_index)) + # Features table currently not implemented and not written to file cdef int write_alias_length(self, int64_t alias_length) except -1: self._write(&alias_length, sizeof(alias_length)) @@ -381,14 +427,27 @@ cdef class Reader: def __dealloc__(self): fclose(self._fp) - cdef int read_header(self, int64_t* nr_entries) except -1: + cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1: status = self._read(nr_entries, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading header from input file") - cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1: + status = self._read(entity_vector_length, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading header from input file") + + cdef int read_vector_element(self, float* element) except -1: + status = self._read(element, sizeof(float)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity vector from input file") + + cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): @@ -401,6 +460,12 @@ cdef class Reader: return 0 # end of file raise IOError("error reading entity prob from input file") + status = self._read(vector_index, sizeof(int32_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity vector from input file") + if feof(self._fp): return 0 else: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d0c83b56e..d9fbe59ff 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,7 +3,7 @@ # coding: utf8 from __future__ import unicode_literals -cimport numpy as np +import numpy as np import numpy import srsly diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 69a1f4961..8de4d5f4c 100644 --- 
a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -84,16 +84,12 @@ cdef struct EntryC: # The hash of this entry's unique ID/name in the kB hash_t entity_hash - # Allows retrieval of one or more vectors. - # Each element of vector_rows should be an index into a vectors table. - # Every entry should have the same number of vectors, so we can avoid storing - # the number of vectors in each knowledge-base struct - int32_t* vector_rows + # Allows retrieval of the entity vector, as an index into a vectors table of the KB. + # Can be expanded later to refer to multiple rows (compositional model to reduce storage footprint). + int32_t vector_index - # Allows retrieval of a struct of non-vector features. We could make this a - # pointer, but we have 32 bits left over in the struct after prob, so we'd - # like this to only be 32 bits. We can also set this to -1, for the common - # case where there are no features. + # Allows retrieval of a struct of non-vector features. + # This is currently not implemented and set to -1 for the common case where there are no features. int32_t feats_row # log probability of entity, based on corpus frequency diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 61baece68..b44332df4 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -14,12 +14,12 @@ def nlp(): def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2') - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.5, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) @@ -32,12 +32,12 @@ def test_kb_valid_entities(nlp): def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): @@ -46,12 +46,12 @@ def test_kb_invalid_entities(nlp): def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): @@ -60,26 +60,38 @@ def test_kb_invalid_probabilities(nlp): def test_kb_invalid_combination(nlp): """Test the invalid 
construction of a KB with non-matching entity and probability lists""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1]) -def test_candidate_generation(nlp): - """Test correct candidate generation""" - mykb = KnowledgeBase(nlp.vocab) +def test_kb_invalid_entity_vector(nlp): + """Test the invalid construction of a KB with non-matching entity vector lengths""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1, 2, 3]) + + # this should fail because the kb's expected entity vector length is 3 + with pytest.raises(ValueError): + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + + +def test_candidate_generation(nlp): + """Test correct candidate generation""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + + # adding entities + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 7b1380623..7a8022890 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -20,7 +20,7 @@ def test_serialize_kb_disk(en_vocab): print(file_path, type(file_path)) kb1.dump(str(file_path)) - kb2 = KnowledgeBase(vocab=en_vocab) + kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3) kb2.load_bulk(str(file_path)) # final assertions @@ -28,12 +28,13 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) + + kb.add_entity(entity="Q53", prob=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity="Q17", prob=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity="Q007", prob=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity="Q44", prob=0.4, entity_vector=[4, 4, 4]) - kb.add_entity(entity="Q53", prob=0.33) - kb.add_entity(entity="Q17", prob=0.2) - kb.add_entity(entity="Q007", prob=0.7) - kb.add_entity(entity="Q44", prob=0.4) kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) @@ -62,10 +63,12 @@ def _check_kb(kb): assert candidates[0].entity_ == "Q007" assert 0.6999 < candidates[0].entity_freq < 0.701 + assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].alias_ == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_ == "Q17" assert 0.199 < candidates[1].entity_freq < 0.201 + 
assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].alias_ == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 From d8b435ceffcf9143a78678d7c87a8e4e4216dcc5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Jun 2019 19:51:27 +0200 Subject: [PATCH 065/148] pretraining description vectors and storing them in the KB --- .../wiki_entity_linking/kb_creator.py | 68 ++++++++++++---- .../pipeline/wiki_entity_linking/run_el.py | 9 +++ .../wiki_entity_linking/train_descriptions.py | 79 +++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 39 ++++----- spacy/kb.pyx | 14 +++- spacy/language.py | 2 +- 6 files changed, 133 insertions(+), 78 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index ae3422c91..74e8efabd 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import spacy +from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder from spacy.kb import KnowledgeBase import csv @@ -10,25 +11,47 @@ import datetime from . import wikipedia_processor as wp from . import wikidata_processor as wd +INPUT_DIM = 300 # dimension of pre-trained vectors +DESC_WIDTH = 64 -def create_kb(vocab, max_entities_per_alias, min_occ, +def create_kb(nlp, max_entities_per_alias, min_occ, entity_def_output, entity_descr_output, - count_input, prior_prob_input, - to_print=False, write_entity_defs=True): + count_input, prior_prob_input, to_print=False): """ Create the knowledge base from Wikidata entries """ - kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) # TODO: entity vectors ! + kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) - print() - print("1. _read_wikidata_entities", datetime.datetime.now()) - print() - title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) + # disable parts of the pipeline when rerunning + read_raw_data = False - # write the title-ID and ID-description mappings to file - if write_entity_defs: + if read_raw_data: + print() + print("1. _read_wikidata_entities", datetime.datetime.now()) + print() + title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) + + # write the title-ID and ID-description mappings to file _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) + else: + # read the mappings from file + title_to_id = _get_entity_to_id(entity_def_output) + id_to_descr = _get_id_to_description(entity_descr_output) + title_list = list(title_to_id.keys()) + + # TODO: remove this filter (just for quicker testing of code) + title_list = title_list[0:34200] + title_to_id = {t: title_to_id[t] for t in title_list} + + # print("title_list", len(title_list), title_list[0:3]) + entity_list = [title_to_id[x] for x in title_list] + # print("entity_list", len(entity_list), entity_list[0:3]) + + # TODO: should we remove entities from the KB where there is no description ? + description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] + # print("description_list", len(description_list), description_list[0:3]) + print() print("2. _get_entity_frequencies", datetime.datetime.now()) @@ -36,13 +59,27 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) print() - print("3. 
adding", len(entity_list), "entities", datetime.datetime.now()) + print("3. train entity encoder", datetime.datetime.now()) print() - # TODO: vector_list ! - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None) + + encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) + encoder.train(description_list=description_list, to_print=True) + print() + + print("4. get entity embeddings", datetime.datetime.now()) + print() + embeddings = encoder.apply_encoder(description_list) + # print("descriptions", description_list[0:3]) + # print("embeddings", len(embeddings), embeddings[0:3]) + #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3]) print() - print("4. adding aliases", datetime.datetime.now()) + print("5. adding", len(entity_list), "entities", datetime.datetime.now()) + print() + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) + + print() + print("6. adding aliases", datetime.datetime.now()) print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, @@ -67,7 +104,6 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") - def _get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: @@ -99,11 +135,11 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in print("wp titles:", wp_titles) # adding aliases with prior probabilities + # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count previous_alias = None total_count = 0 counts = list() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index c0c219829..f6797587e 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -12,6 +12,15 @@ from examples.pipeline.wiki_entity_linking import training_set_creator # import neuralcoref +def run_kb_toy_example(kb): + for mention in ("Bush", "President", "Homer"): + candidates = kb.get_candidates(mention) + + print("generating candidates for " + mention + " :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() + def run_el_toy_example(nlp, kb): _prepare_pipeline(nlp, kb) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 88b1bf819..8513a25fd 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -14,72 +14,83 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: - INPUT_DIM = 300 # dimension of pre-trained vectors - DESC_WIDTH = 64 - DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.1 + STOP_THRESHOLD = 0.9 # 0.1 BATCH_SIZE = 1000 - def __init__(self, kb, nlp): + def __init__(self, nlp, input_dim, desc_width): self.nlp = nlp - self.kb = kb + self.input_dim = input_dim + self.desc_width = desc_width - def run(self, entity_descr_output): - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + def apply_encoder(self, description_list): + 
if self.encoder is None: + raise ValueError("Can not apply encoder before training it") - processed, loss = self._train_model(entity_descr_output, id_to_descr) - print("Trained on", processed, "entities across", self.EPOCHS, "epochs") - print("Final loss:", loss) - print() + print("Encoding", len(description_list), "entities") - # TODO: apply and write to file afterwards ! - # self._apply_encoder(id_to_descr) + batch_size = 10000 - self._test_encoder() + start = 0 + stop = min(batch_size, len(description_list)) + encodings = [] - def _train_model(self, entity_descr_output, id_to_descr): + while start < len(description_list): + docs = list(self.nlp.pipe(description_list[start:stop])) + doc_embeddings = [self._get_doc_embedding(doc) for doc in docs] + enc = self.encoder(np.asarray(doc_embeddings)) + encodings.extend(enc.tolist()) + + start = start + batch_size + stop = min(stop + batch_size, len(description_list)) + print("encoded :", len(encodings)) + + return encodings + + def train(self, description_list, to_print=False): + processed, loss = self._train_model(description_list) + + if to_print: + print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print("Final loss:", loss) + + # self._test_encoder() + + def _train_model(self, description_list): # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy - self._build_network(self.INPUT_DIM, self.DESC_WIDTH) + self._build_network(self.input_dim, self.desc_width) processed = 0 loss = 1 + descriptions = description_list.copy() # copy this list so that shuffling does not affect other functions for i in range(self.EPOCHS): - entity_keys = list(id_to_descr.keys()) - shuffle(entity_keys) + shuffle(descriptions) batch_nr = 0 start = 0 - stop = min(self.BATCH_SIZE, len(entity_keys)) + stop = min(self.BATCH_SIZE, len(descriptions)) - while loss > self.STOP_THRESHOLD and start < len(entity_keys): + while loss > self.STOP_THRESHOLD and start < len(descriptions): batch = [] - for e in entity_keys[start:stop]: - descr = id_to_descr[e] + for descr in descriptions[start:stop]: doc = self.nlp(descr) doc_vector = self._get_doc_embedding(doc) batch.append(doc_vector) - loss = self.update(batch) + loss = self._update(batch) print(i, batch_nr, loss) processed += len(batch) batch_nr += 1 start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(entity_keys)) + stop = min(stop + self.BATCH_SIZE, len(descriptions)) return processed, loss - def _apply_encoder(self, id_to_descr): - for id, descr in id_to_descr.items(): - doc = self.nlp(descr) - doc_vector = self._get_doc_embedding(doc) - encoding = self.encoder(np.asarray([doc_vector])) - @staticmethod def _get_doc_embedding(doc): indices = np.zeros((len(doc),), dtype="i") @@ -101,16 +112,16 @@ class EntityEncoder: self.sgd = create_default_optimizer(self.model.ops) - def update(self, vectors): + def _update(self, vectors): predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) - loss, d_scores = self.get_loss(scores=predictions, golds=np.asarray(vectors)) + loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) bp_model(d_scores, sgd=self.sgd) return loss / len(vectors) @staticmethod - def get_loss(golds, scores): + def _get_loss(golds, scores): loss, gradients = get_cossim_loss(scores, golds) return loss, gradients diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index d813238b7..a669634f9 100644 --- 
a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el -from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy @@ -28,6 +27,7 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' if __name__ == "__main__": print("START", datetime.datetime.now()) print() + nlp = spacy.load('en_core_web_lg') my_kb = None # one-time methods to create KB and write to file @@ -37,10 +37,7 @@ if __name__ == "__main__": # read KB back in from file to_read_kb = True - to_test_kb = False - - # run entity description pre-training - run_desc_training = True + to_test_kb = True # create training dataset create_wp_training = False @@ -51,6 +48,8 @@ if __name__ == "__main__": # apply named entity linking to the dev dataset apply_to_dev = False + to_test_pipeline = False + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -69,9 +68,7 @@ if __name__ == "__main__": # run only once ! if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_sm') - my_vocab = my_nlp.vocab - my_kb = kb_creator.create_kb(my_vocab, + my_kb = kb_creator.create_kb(nlp, max_entities_per_alias=10, min_occ=5, entity_def_output=ENTITY_DEFS, @@ -85,7 +82,7 @@ if __name__ == "__main__": print("STEP 3b: write KB", datetime.datetime.now()) my_kb.dump(KB_FILE) - my_vocab.to_disk(VOCAB_DIR) + nlp.vocab.to_disk(VOCAB_DIR) print() # STEP 4 : read KB back in from file @@ -101,18 +98,9 @@ if __name__ == "__main__": # test KB if to_test_kb: - my_nlp = spacy.load('en_core_web_sm') - run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp) + run_el.run_kb_toy_example(kb=my_kb) print() - # STEP 4b : read KB back in from file, create entity descriptions - # TODO: write back to file - if run_desc_training: - print("STEP 4b: training entity descriptions", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_md') - EntityEncoder(my_kb, my_nlp).run(entity_descr_output=ENTITY_DESCR) - print() - # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) @@ -121,15 +109,18 @@ if __name__ == "__main__": # STEP 6: apply the EL algorithm on the training dataset if run_el_training: print("STEP 6: training", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_md') - trainer = EL_Model(kb=my_kb, nlp=my_nlp) + trainer = EL_Model(kb=my_kb, nlp=nlp) trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) print() - # STEP 7: apply the EL algorithm on the dev dataset + # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?) 
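
Dumping the KB (step 3b) and reading it back in (step 4) only round-trips cleanly when the loading KnowledgeBase is constructed with the same entity_vector_length as the one that was dumped, which is what the serialization test earlier in this series exercises. A minimal sketch of that roundtrip, outside the pipeline script; the path and the toy entity/alias values are invented for illustration and the blank Vocab just stands in for nlp.vocab:

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    vocab = Vocab()
    kb1 = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    kb1.add_entity(entity="Q42", prob=0.5, entity_vector=[1, 2, 3])
    kb1.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])
    kb1.dump("/tmp/kb")  # placeholder path

    # the second KB must use the same entity_vector_length to load the dump
    kb2 = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    kb2.load_bulk("/tmp/kb")
    print([c.entity_ for c in kb2.get_candidates("Douglas")])  # expected: ["Q42"]
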
if apply_to_dev: - my_nlp = spacy.load('en_core_web_md') - run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) + run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000) + print() + + # test KB + if to_test_pipeline: + run_el.run_el_toy_example(kb=my_kb, nlp=nlp) print() # TODO coreference resolution diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 790bb4992..30440227f 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -124,6 +124,14 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, prob_list, vector_list): + if len(entity_list) != len(prob_list): + # TODO: proper error + raise ValueError("Entity list and prob list should have the same length") + + if len(entity_list) != len(vector_list): + # TODO: proper error + raise ValueError("Entity list and vector list should have the same length") + nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) @@ -131,12 +139,12 @@ cdef class KnowledgeBase: i = 0 cdef EntryC entry while i < nr_entities: - entity_vector = entity_list[i] + entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: # TODO: proper error - raise ValueError("Entity vector length should have been", self.entity_vector_length) + raise ValueError("Entity vector is", len(entity_vector), "length but should have been", self.entity_vector_length) - entity_hash = self.vocab.strings.add(entity_vector) + entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash entry.prob = prob_list[i] diff --git a/spacy/language.py b/spacy/language.py index 39d95c689..ec3232bd5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -117,7 +117,7 @@ class Language(object): "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), - "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), + "entity_linker": lambda nlp, **cfg: EntityLinker(**cfg), "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), From 61f0e2af654ae6202a9b283794021c84d458fd5b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Jun 2019 20:22:14 +0200 Subject: [PATCH 066/148] code cleanup --- .../wiki_entity_linking/kb_creator.py | 13 ++------ .../pipeline/wiki_entity_linking/run_el.py | 12 ++----- .../training_set_creator.py | 1 - .../wiki_entity_linking/wiki_nel_pipeline.py | 14 ++++++-- spacy/pipeline/pipes.pyx | 32 ++++++++----------- 5 files changed, 31 insertions(+), 41 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 74e8efabd..ee632bd48 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = title_list[0:34200] title_to_id = {t: title_to_id[t] for t in title_list} - # print("title_list", len(title_list), title_list[0:3]) - entity_list = [title_to_id[x] for x in title_list] - # print("entity_list", len(entity_list), entity_list[0:3]) - # TODO: should we remove entities from the KB where there is no description ? 
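
The length checks added to set_entities() above make the bulk-loading contract explicit: three parallel lists of the same length, with every vector matching the KB's entity_vector_length. A toy usage sketch (IDs, frequencies and vectors invented, not taken from the Wikidata pipeline):

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)

    entity_ids = ["Q42", "Q7259"]                  # Wikidata-style IDs (toy values)
    freqs = [0.8, 0.2]                             # corpus frequencies used as priors
    vectors = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]   # one 3-d vector per entity

    kb.set_entities(entity_list=entity_ids, prob_list=freqs, vector_list=vectors)

    # a length mismatch between the three lists, or a vector that is not
    # entity_vector_length long, raises the ValueError added in this patch
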
+ # Currently keeping entities from the KB where there is no description - putting a default void description description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] - # print("description_list", len(description_list), description_list[0:3]) - print() print("2. _get_entity_frequencies", datetime.datetime.now()) @@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ, print("4. get entity embeddings", datetime.datetime.now()) print() embeddings = encoder.apply_encoder(description_list) - # print("descriptions", description_list[0:3]) - # print("embeddings", len(embeddings), embeddings[0:3]) - #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3]) print() print("5. adding", len(entity_list), "entities", datetime.datetime.now()) @@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") + def _get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: @@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in print("wp titles:", wp_titles) # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count + # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index f6797587e..c3074ab5c 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator def run_kb_toy_example(kb): - for mention in ("Bush", "President", "Homer"): + for mention in ("Bush", "Douglas Adams", "Homer"): candidates = kb.get_candidates(mention) print("generating candidates for " + mention + " :") @@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True): return precision, recall, fscore, accuracy -def _prepare_pipeline(nlp, kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) + # TODO -def add_coref(): +def add_coref(nlp): """ Add coreference resolution to our model """ - nlp = spacy.load('en_core_web_sm') - # nlp = spacy.load('en') - # TODO: this doesn't work yet # neuralcoref.add_to_pipe(nlp) print("done adding to pipe") diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index b1c63c55c..ac8ad0744 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv" def create_training(kb, entity_def_input, training_output): if not kb: raise ValueError("kb should be defined") - # nlp = spacy.load('en_core_web_sm') wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 
a669634f9..390a6800b 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -37,11 +37,13 @@ if __name__ == "__main__": # read KB back in from file to_read_kb = True - to_test_kb = True + to_test_kb = False # create training dataset create_wp_training = False + train_pipe = True + # run EL training run_el_training = False @@ -106,7 +108,15 @@ if __name__ == "__main__": print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: apply the EL algorithm on the training dataset + # STEP 6: create the entity linking pipe + if train_pipe: + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) + nlp.add_pipe(el_pipe, last=True) + + ### BELOW CODE IS DEPRECATED ### + + # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx if run_el_training: print("STEP 6: training", datetime.datetime.now()) trainer = EL_Model(kb=my_kb, nlp=nlp) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d9fbe59ff..c5187a593 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser): class EntityLinker(Pipe): + """Pipeline component for named entity linking. + + DOCS: TODO + """ name = 'entity_linker' @classmethod def Model(cls, **cfg): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 32) - entity_width = cfg.get("entity_width", 64) article_width = cfg.get("article_width", 128) sent_width = cfg.get("sent_width", 64) - - entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width) + entity_width = cfg["kb"].entity_vector_length article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) # dimension of the mention encoder needs to match the dimension of the entity encoder - mention_width = entity_encoder.nO + mention_width = article_width + sent_width mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) - return entity_encoder, article_encoder, sent_encoder, mention_encoder + return article_encoder, sent_encoder, mention_encoder def __init__(self, **cfg): - # TODO: bring-your-own-model self.mention_encoder = True - self.cfg = dict(cfg) self.kb = self.cfg["kb"] - # TODO: fix this. store entity vectors in the KB ? 
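
The shape bookkeeping in the Model() factory above is easiest to see with plain arrays: the article and sentence encodings are concatenated (mention_width = article_width + sent_width) and then projected by the Affine layer down to the KB's entity vector length, so mention and entity vectors can be compared directly. A toy numpy sketch of that flow with random weights, using the default widths from these patches (128/64 encoders, 64-dimensional entity vectors); it mimics the dimensions only, not the actual Thinc layers:

    import numpy as np

    article_width, sent_width, entity_width = 128, 64, 64

    doc_enc = np.random.rand(article_width)     # stand-in for article_encoder output
    sent_enc = np.random.rand(sent_width)       # stand-in for sent_encoder output

    concat = np.concatenate([doc_enc, sent_enc])          # mention_width = 192
    W = np.random.rand(entity_width, concat.shape[0])     # Affine weights (random here)
    b = np.zeros(entity_width)
    mention_vector = W @ concat + b             # same length as a KB entity vector

    assert mention_vector.shape == (entity_width,)
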
- self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv') - def use_avg_params(self): """Modify the pipe's encoders/models, to use their average parameter values.""" with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.entity_encoder.use_params(self.sgd_entity.averages)\ and self.sent_encoder.use_params(self.sgd_sent.averages) \ and self.mention_encoder.use_params(self.sgd_mention.averages): yield @@ -1113,14 +1109,13 @@ class EntityLinker(Pipe): def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): if self.mention_encoder is True: - self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) + self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) - self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): - """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """ + """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """ self.require_model() entity_docs, article_docs, sentence_docs = docs @@ -1131,7 +1126,7 @@ class EntityLinker(Pipe): article_docs = [article_docs] sentence_docs = [sentence_docs] - entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop) + entity_encodings = None #TODO doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) @@ -1195,10 +1190,9 @@ class EntityLinker(Pipe): for c in candidates: prior_prob = c.prior_prob kb_id = c.entity_ - description = self.id_to_descr.get(kb_id) - entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ? - sim = cosine(entity_encodings, mention_enc_t) - score = prior_prob + sim - (prior_prob*sim) # TODO: weights ? + entity_encoding = c.entity_vector + sim = cosine([entity_encoding], mention_enc_t) + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
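
The candidate score computed just above, prior_prob + sim - prior_prob*sim, is the probabilistic OR of the two signals (1 - (1 - prior_prob)(1 - sim)): either a strong prior or a strong context match is enough to push the score up. Cosine similarity is not strictly bounded to [0, 1], so the probabilistic reading is only approximate. A tiny illustration with invented values:

    def combine(prior_prob, sim):
        # probabilistic OR: 1 - (1 - prior_prob) * (1 - sim)
        return prior_prob + sim - prior_prob * sim

    print(combine(0.9, 0.1))   # ~0.91 -> carried by the prior
    print(combine(0.1, 0.9))   # ~0.91 -> carried by the context match
    print(combine(0.5, 0.5))   # 0.75
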
scores.append(score) best_index = scores.index(max(scores)) From a5c061f50633831ce49e4cc6660d177569bb9767 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jun 2019 12:58:42 +0200 Subject: [PATCH 067/148] storing NEL training data in GoldParse objects --- .../wiki_entity_linking/train_descriptions.py | 1 + .../training_set_creator.py | 63 ++++++++++++++++++- .../wiki_entity_linking/wiki_nel_pipeline.py | 28 ++++++++- spacy/kb.pyx | 5 ++ spacy/pipeline/pipes.pyx | 8 ++- 5 files changed, 99 insertions(+), 6 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 8513a25fd..f2c3fa05d 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -1,3 +1,4 @@ +# coding: utf-8 from random import shuffle from examples.pipeline.wiki_entity_linking import kb_creator diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index ac8ad0744..c1879e2fb 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -1,11 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals +import os import re -import csv import bz2 import datetime +from os import listdir +from examples.pipeline.wiki_entity_linking import run_el +from spacy.gold import GoldParse +from spacy.matcher import PhraseMatcher from . import wikipedia_processor as wp, kb_creator """ @@ -294,5 +298,62 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr return correct_entries_per_article, incorrect_entries_per_article +def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print): + correct_entries, incorrect_entries = read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=True) + + docs = list() + golds = list() + + cnt = 0 + next_entity_nr = 1 + files = listdir(training_dir) + for f in files: + if not limit or cnt < limit: + if dev == run_el.is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0 and to_print: + print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + + try: + # parse the article text + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = nlp(text) + truncated_text = text[0:min(doc_cutoff, len(text))] + + gold_entities = dict() + + # process all positive and negative entities, collect all relevant mentions in this article + for mention, entity_pos in correct_entries[article_id].items(): + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(nlp.vocab) + patterns = list(nlp.tokenizer.pipe([mention])) + + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + + # store gold entities + for match_id, start, end in matches: + gold_entities[(start, end, entity_pos)] = 1.0 + + gold = GoldParse(doc=article_doc, cats=gold_entities) + docs.append(article_doc) + golds.append(gold) + + cnt += 1 + except Exception as e: + print("Problem parsing article", article_id) + print(e) + + if to_print: + print() + print("Processed", cnt, "training articles, dev=" + str(dev)) + print() + return docs, golds + + diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py 
b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 390a6800b..08f4adda0 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -23,6 +23,9 @@ VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' +MAX_CANDIDATES=10 +MIN_PAIR_OCC=5 +DOC_CHAR_CUTOFF=300 if __name__ == "__main__": print("START", datetime.datetime.now()) @@ -71,8 +74,8 @@ if __name__ == "__main__": if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) my_kb = kb_creator.create_kb(nlp, - max_entities_per_alias=10, - min_occ=5, + max_entities_per_alias=MAX_CANDIDATES, + min_occ=MIN_PAIR_OCC, entity_def_output=ENTITY_DEFS, entity_descr_output=ENTITY_DESCR, count_input=ENTITY_COUNTS, @@ -110,10 +113,29 @@ if __name__ == "__main__": # STEP 6: create the entity linking pipe if train_pipe: - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) + + docs, golds = training_set_creator.read_training(nlp=nlp, + training_dir=TRAINING_DIR, + id_to_descr=id_to_descr, + doc_cutoff=DOC_CHAR_CUTOFF, + dev=False, + limit=10, + to_print=False) + + # for doc, gold in zip(docs, golds): + # print("doc", doc) + # for entity, label in gold.cats.items(): + # print("entity", entity, label) + # print() + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) nlp.add_pipe(el_pipe, last=True) + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] + with nlp.disable_pipes(*other_pipes): # only train Entity Linking + nlp.begin_training() + ### BELOW CODE IS DEPRECATED ### # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 30440227f..ade2360be 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -82,6 +82,11 @@ cdef class KnowledgeBase: self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + @property + def entity_vector_length(self): + """RETURNS (uint64): length of the entity vectors""" + return self.entity_vector_length + def __len__(self): return self.get_size_entities() diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c5187a593..a3caae455 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1081,8 +1081,8 @@ class EntityLinker(Pipe): sent_width = cfg.get("sent_width", 64) entity_width = cfg["kb"].entity_vector_length - article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) - sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) + article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) + sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) # dimension of the mention encoder needs to match the dimension of the entity encoder mention_width = article_width + sent_width @@ -1118,6 +1118,10 @@ class EntityLinker(Pipe): """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """ self.require_model() + if len(docs) != len(golds): + raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), + n_golds=len(golds))) + entity_docs, article_docs, sentence_docs = docs assert len(entity_docs) == 
len(article_docs) == len(sentence_docs) From 0486ccabfdbfd6ee4531574ad18b5dde085b43be Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jun 2019 13:54:45 +0200 Subject: [PATCH 068/148] introduce goldparse.links --- .../training_set_creator.py | 14 ++-- .../wiki_entity_linking/wiki_nel_pipeline.py | 34 +++++--- spacy/gold.pxd | 1 + spacy/gold.pyx | 5 +- spacy/pipeline/pipes.pyx | 81 +++++++++++-------- 5 files changed, 82 insertions(+), 53 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index c1879e2fb..156bce05f 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -303,8 +303,7 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri collect_correct=True, collect_incorrect=True) - docs = list() - golds = list() + data = [] cnt = 0 next_entity_nr = 1 @@ -323,7 +322,7 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri article_doc = nlp(text) truncated_text = text[0:min(doc_cutoff, len(text))] - gold_entities = dict() + gold_entities = list() # process all positive and negative entities, collect all relevant mentions in this article for mention, entity_pos in correct_entries[article_id].items(): @@ -337,11 +336,10 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri # store gold entities for match_id, start, end in matches: - gold_entities[(start, end, entity_pos)] = 1.0 + gold_entities.append((start, end, entity_pos)) - gold = GoldParse(doc=article_doc, cats=gold_entities) - docs.append(article_doc) - golds.append(gold) + gold = GoldParse(doc=article_doc, links=gold_entities) + data.append((article_doc, gold)) cnt += 1 except Exception as e: @@ -352,7 +350,7 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return docs, golds + return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 08f4adda0..b66f8b316 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,6 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import random + +from spacy.util import minibatch, compounding + from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el from examples.pipeline.wiki_entity_linking.train_el import EL_Model @@ -23,9 +27,11 @@ VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' -MAX_CANDIDATES=10 -MIN_PAIR_OCC=5 -DOC_CHAR_CUTOFF=300 +MAX_CANDIDATES = 10 +MIN_PAIR_OCC = 5 +DOC_CHAR_CUTOFF = 300 +EPOCHS = 5 +DROPOUT = 0.1 if __name__ == "__main__": print("START", datetime.datetime.now()) @@ -115,7 +121,7 @@ if __name__ == "__main__": if train_pipe: id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) - docs, golds = training_set_creator.read_training(nlp=nlp, + train_data = training_set_creator.read_training(nlp=nlp, training_dir=TRAINING_DIR, id_to_descr=id_to_descr, doc_cutoff=DOC_CHAR_CUTOFF, @@ -123,12 +129,6 @@ if __name__ == "__main__": limit=10, to_print=False) - # for doc, gold in zip(docs, golds): - # print("doc", doc) - # for entity, label in 
gold.cats.items(): - # print("entity", entity, label) - # print() - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) nlp.add_pipe(el_pipe, last=True) @@ -136,6 +136,20 @@ if __name__ == "__main__": with nlp.disable_pipes(*other_pipes): # only train Entity Linking nlp.begin_training() + for itn in range(EPOCHS): + random.shuffle(train_data) + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + docs, golds = zip(*batch) + nlp.update( + docs, + golds, + drop=DROPOUT, + losses=losses, + ) + print("Losses", losses) + ### BELOW CODE IS DEPRECATED ### # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx diff --git a/spacy/gold.pxd b/spacy/gold.pxd index a1550b1ef..8943a155a 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -31,6 +31,7 @@ cdef class GoldParse: cdef public list ents cdef public dict brackets cdef public object cats + cdef public list links cdef readonly list cand_to_gold cdef readonly list gold_to_cand diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 569979a5f..4fb22f3f0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -427,7 +427,7 @@ cdef class GoldParse: def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, deps=None, entities=None, make_projective=False, - cats=None, **_): + cats=None, links=None, **_): """Create a GoldParse. doc (Doc): The document the annotations refer to. @@ -450,6 +450,8 @@ cdef class GoldParse: examples of a label to have the value 0.0. Labels not in the dictionary are treated as missing - the gradient for those labels will be zero. + links (iterable): A sequence of `(start_char, end_char, kb_id)` tuples, + representing the external ID of an entity in a knowledge base. RETURNS (GoldParse): The newly constructed object. 
""" if words is None: @@ -485,6 +487,7 @@ cdef class GoldParse: self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) self.cats = {} if cats is None else dict(cats) + self.links = links self.words = [None] * len(doc) self.tags = [None] * len(doc) self.heads = [None] * len(doc) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a3caae455..f15ffd036 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1115,48 +1115,61 @@ class EntityLinker(Pipe): self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): - """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """ self.require_model() if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), + raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), n_golds=len(golds))) - entity_docs, article_docs, sentence_docs = docs - assert len(entity_docs) == len(article_docs) == len(sentence_docs) + if isinstance(docs, Doc): + docs = [docs] + golds = [golds] - if isinstance(entity_docs, Doc): - entity_docs = [entity_docs] - article_docs = [article_docs] - sentence_docs = [sentence_docs] + for doc, gold in zip(docs, golds): + print("doc", doc) + for entity in gold.links: + start, end, gold_kb = entity + print("entity", entity) + mention = doc[start:end].text + print("mention", mention) + candidates = self.kb.get_candidates(mention) + for c in candidates: + prior_prob = c.prior_prob + kb_id = c.entity_ + print("candidate", kb_id, prior_prob) + entity_encoding = c.entity_vector + print() - entity_encodings = None #TODO - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + print() - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(article_docs))] - mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - - loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) - - mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) - - # gradient : concat (doc+sent) vs. desc - sent_start = self.article_encoder.nO - sent_gradients = list() - doc_gradients = list() - for x in mention_gradient: - doc_gradients.append(list(x[0:sent_start])) - sent_gradients.append(list(x[sent_start:])) - - bp_doc(doc_gradients, sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) - - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - return loss + # entity_encodings = None #TODO + # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + # + # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + # range(len(article_docs))] + # mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) + # + # loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + # + # mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) + # + # # gradient : concat (doc+sent) vs. 
desc + # sent_start = self.article_encoder.nO + # sent_gradients = list() + # doc_gradients = list() + # for x in mention_gradient: + # doc_gradients.append(list(x[0:sent_start])) + # sent_gradients.append(list(x[sent_start:])) + # + # bp_doc(doc_gradients, sgd=self.sgd_article) + # bp_sent(sent_gradients, sgd=self.sgd_sent) + # + # if losses is not None: + # losses.setdefault(self.name, 0.0) + # losses[self.name] += loss + # return loss + return None def get_loss(self, docs, golds, scores): loss, gradients = get_cossim_loss(scores, golds) From 7de1ee69b819cba8b66db370dcb1ec169b4a7b74 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jun 2019 15:55:10 +0200 Subject: [PATCH 069/148] training loop in proper pipe format --- .../wiki_entity_linking/wiki_nel_pipeline.py | 13 +-- spacy/pipeline/pipes.pyx | 84 ++++++++++--------- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index b66f8b316..ded4bdc24 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -126,7 +126,7 @@ if __name__ == "__main__": id_to_descr=id_to_descr, doc_cutoff=DOC_CHAR_CUTOFF, dev=False, - limit=10, + limit=100, to_print=False) el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) @@ -137,6 +137,8 @@ if __name__ == "__main__": nlp.begin_training() for itn in range(EPOCHS): + print() + print("EPOCH", itn) random.shuffle(train_data) losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) @@ -150,15 +152,6 @@ if __name__ == "__main__": ) print("Losses", losses) - ### BELOW CODE IS DEPRECATED ### - - # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx - if run_el_training: - print("STEP 6: training", datetime.datetime.now()) - trainer = EL_Model(kb=my_kb, nlp=nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) - print() - # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?) 
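
The training loop above consumes (doc, gold) pairs in which the GoldParse carries the entity links introduced in the previous patch. A stripped-down sketch of building one such pair and running one training pass; the helper names are invented, the doc/KB id are toy values, and start/end are token offsets, matching how the pipe slices doc[start:end] in these patches:

    import random
    from spacy.gold import GoldParse
    from spacy.util import minibatch, compounding

    def make_example(doc, start, end, kb_id):
        # one gold entity link, e.g. make_example(doc, 0, 2, "Q42")
        return doc, GoldParse(doc, links=[(start, end, kb_id)])

    def train_entity_linker(nlp, train_data, epochs=5, dropout=0.1):
        # train_data is a list of (doc, GoldParse) pairs as built above
        for _ in range(epochs):
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, drop=dropout, losses=losses)
            print("Losses", losses)
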
if apply_to_dev: run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f15ffd036..01302b618 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1125,51 +1125,59 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] + article_docs = list() + sentence_docs = list() + entity_encodings = list() + for doc, gold in zip(docs, golds): - print("doc", doc) for entity in gold.links: start, end, gold_kb = entity - print("entity", entity) - mention = doc[start:end].text - print("mention", mention) - candidates = self.kb.get_candidates(mention) + mention = doc[start:end] + sentence = mention.sent + + candidates = self.kb.get_candidates(mention.text) for c in candidates: - prior_prob = c.prior_prob kb_id = c.entity_ - print("candidate", kb_id, prior_prob) - entity_encoding = c.entity_vector - print() + # TODO: currently only training on the positive instances + if kb_id == gold_kb: + prior_prob = c.prior_prob + entity_encoding = c.entity_vector - print() + entity_encodings.append(entity_encoding) + article_docs.append(doc) + sentence_docs.append(sentence.as_doc()) - # entity_encodings = None #TODO - # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) - # - # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - # range(len(article_docs))] - # mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - # - # loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) - # - # mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) - # - # # gradient : concat (doc+sent) vs. desc - # sent_start = self.article_encoder.nO - # sent_gradients = list() - # doc_gradients = list() - # for x in mention_gradient: - # doc_gradients.append(list(x[0:sent_start])) - # sent_gradients.append(list(x[sent_start:])) - # - # bp_doc(doc_gradients, sgd=self.sgd_article) - # bp_sent(sent_gradients, sgd=self.sgd_sent) - # - # if losses is not None: - # losses.setdefault(self.name, 0.0) - # losses[self.name] += loss - # return loss - return None + if len(entity_encodings) > 0: + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + range(len(article_docs))] + mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) + + entity_encodings = np.asarray(entity_encodings, dtype=np.float32) + + loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + + mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) + + # gradient : concat (doc+sent) vs. 
desc + sent_start = self.article_encoder.nO + sent_gradients = list() + doc_gradients = list() + for x in mention_gradient: + doc_gradients.append(list(x[0:sent_start])) + sent_gradients.append(list(x[sent_start:])) + + bp_doc(doc_gradients, sgd=self.sgd_article) + bp_sent(sent_gradients, sgd=self.sgd_sent) + + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += loss + return loss + + return 0 def get_loss(self, docs, golds, scores): loss, gradients = get_cossim_loss(scores, golds) From 83dc7b46fd1b39023c6eb883471c961d9e5bd51c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 10 Jun 2019 21:25:26 +0200 Subject: [PATCH 070/148] first tests with EL pipe --- .../wiki_entity_linking/kb_creator.py | 4 +-- .../wiki_entity_linking/train_descriptions.py | 4 +-- .../training_set_creator.py | 4 +-- .../wiki_entity_linking/wiki_nel_pipeline.py | 19 +++++----- spacy/pipeline/pipes.pyx | 36 ++++++++++++++----- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index ee632bd48..e7e3d077d 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - title_list = title_list[0:34200] - title_to_id = {t: title_to_id[t] for t in title_list} + # title_list = title_list[0:34200] + # title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index f2c3fa05d..e1a2f1797 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -17,7 +17,7 @@ class EntityEncoder: DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.9 # 0.1 + STOP_THRESHOLD = 0.1 BATCH_SIZE = 1000 @@ -32,7 +32,7 @@ class EntityEncoder: print("Encoding", len(description_list), "entities") - batch_size = 10000 + batch_size = 100000 start = 0 stop = min(batch_size, len(description_list)) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 156bce05f..38a86058d 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -298,7 +298,7 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr return correct_entries_per_article, incorrect_entries_per_article -def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print): +def read_training(nlp, training_dir, dev, limit, to_print): correct_entries, incorrect_entries = read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) @@ -306,7 +306,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri data = [] cnt = 0 - next_entity_nr = 1 files = listdir(training_dir) for f in files: if not limit or cnt < limit: @@ -320,7 +319,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() article_doc = nlp(text) - truncated_text = text[0:min(doc_cutoff, len(text))] gold_entities = 
list() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ded4bdc24..4be1ae2fb 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -121,15 +121,16 @@ if __name__ == "__main__": if train_pipe: id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) - train_data = training_set_creator.read_training(nlp=nlp, - training_dir=TRAINING_DIR, - id_to_descr=id_to_descr, - doc_cutoff=DOC_CHAR_CUTOFF, - dev=False, - limit=100, - to_print=False) + train_limit = 10 + print("Training on", train_limit, "articles") - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) + train_data = training_set_creator.read_training(nlp=nlp, + training_dir=TRAINING_DIR, + dev=False, + limit=train_limit, + to_print=False) + + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) nlp.add_pipe(el_pipe, last=True) other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] @@ -141,7 +142,7 @@ if __name__ == "__main__": print("EPOCH", itn) random.shuffle(train_data) losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) for batch in batches: docs, golds = zip(*batch) nlp.update( diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01302b618..e5ed2ec23 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -11,9 +11,8 @@ from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm -from thinc.neural.util import to_categorical, copy_array - -from spacy.cli.pretrain import get_cossim_loss +from thinc.neural.util import to_categorical +from thinc.neural.util import get_array_module from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -33,9 +32,6 @@ from .._ml import masked_language_model, create_default_optimizer from ..errors import Errors, TempErrors from .. 
import util -# TODO: remove -from examples.pipeline.wiki_entity_linking import kb_creator - def _load_cfg(path): if path.exists(): @@ -1094,6 +1090,7 @@ class EntityLinker(Pipe): self.mention_encoder = True self.cfg = dict(cfg) self.kb = self.cfg["kb"] + self.doc_cutoff = self.cfg["doc_cutoff"] def use_avg_params(self): """Modify the pipe's encoders/models, to use their average parameter values.""" @@ -1134,6 +1131,7 @@ class EntityLinker(Pipe): start, end, gold_kb = entity mention = doc[start:end] sentence = mention.sent + first_par = doc[0:self.doc_cutoff].as_doc() candidates = self.kb.get_candidates(mention.text) for c in candidates: @@ -1144,7 +1142,7 @@ class EntityLinker(Pipe): entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - article_docs.append(doc) + article_docs.append(first_par) sentence_docs.append(sentence.as_doc()) if len(entity_encodings) > 0: @@ -1158,6 +1156,10 @@ class EntityLinker(Pipe): entity_encodings = np.asarray(entity_encodings, dtype=np.float32) loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + # print("scores", mention_encodings) + # print("golds", entity_encodings) + # print("loss", loss) + # print("d_scores", d_scores) mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) @@ -1180,9 +1182,26 @@ class EntityLinker(Pipe): return 0 def get_loss(self, docs, golds, scores): - loss, gradients = get_cossim_loss(scores, golds) + targets = [[1] for _ in golds] # assuming we're only using positive examples + loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets) + #loss = loss / len(golds) return loss, gradients + def get_cossim_loss_2(self, yh, y, t): + # Add a small constant to avoid 0 vectors + yh = yh + 1e-8 + y = y + 1e-8 + # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity + xp = get_array_module(yh) + norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) + norm_y = xp.linalg.norm(y, axis=1, keepdims=True) + mul_norms = norm_yh * norm_y + cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms + d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) + loss = xp.abs(cos - t).sum() + inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) + return loss, -inverse + def __call__(self, doc): entities, kb_ids = self.predict([doc]) self.set_annotations([doc], entities, kb_ids) @@ -1220,6 +1239,7 @@ class EntityLinker(Pipe): score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
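
The analytic gradient in get_cossim_loss_2 above follows the standard partial derivative of cosine similarity, d cos/d yh = y/(|yh||y|) - cos * yh/|yh|^2. A quick numpy sanity check of that formula against central finite differences, independent of the patch itself:

    import numpy as np

    def cos_sim(yh, y):
        return np.dot(yh, y) / (np.linalg.norm(yh) * np.linalg.norm(y))

    def cos_grad(yh, y):
        norm_yh, norm_y = np.linalg.norm(yh), np.linalg.norm(y)
        cos = np.dot(yh, y) / (norm_yh * norm_y)
        return y / (norm_yh * norm_y) - cos * yh / norm_yh ** 2

    rng = np.random.RandomState(0)
    yh, y = rng.rand(5) + 1e-8, rng.rand(5) + 1e-8
    eps = 1e-6
    numeric = np.array([(cos_sim(yh + eps * e, y) - cos_sim(yh - eps * e, y)) / (2 * eps)
                        for e in np.eye(5)])
    assert np.allclose(numeric, cos_grad(yh, y), atol=1e-6)
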
scores.append(score) + # TODO: thresholding best_index = scores.index(max(scores)) best_candidate = candidates[best_index] final_entities.append(ent) From fe1ed432eff61f087a06c89840f37dc75d24ee59 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 11 Jun 2019 11:40:58 +0200 Subject: [PATCH 071/148] eval on dev set, varying combo's of prior and context scores --- .../wiki_entity_linking/kb_creator.py | 2 +- .../pipeline/wiki_entity_linking/run_el.py | 19 --- .../wiki_entity_linking/wiki_nel_pipeline.py | 130 ++++++++++++++---- spacy/pipeline/pipes.pyx | 46 ++++--- 4 files changed, 127 insertions(+), 70 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index e7e3d077d..d097ac449 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -20,7 +20,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ, """ Create the knowledge base from Wikidata entries """ kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) - # disable parts of the pipeline when rerunning + # disable this part of the pipeline when rerunning the KB generation from preprocessed files read_raw_data = False if read_raw_data: diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index c3074ab5c..52ccccfda 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -21,29 +21,10 @@ def run_kb_toy_example(kb): print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") print() -def run_el_toy_example(nlp, kb): - _prepare_pipeline(nlp, kb) - candidates = kb.get_candidates("Bush") - - print("generating candidates for 'Bush' :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." 
- doc = nlp(text) - - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) def run_el_dev(nlp, kb, training_dir, limit=None): - _prepare_pipeline(nlp, kb) - correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=False) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 4be1ae2fb..6e4ca6970 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -6,7 +6,6 @@ import random from spacy.util import minibatch, compounding from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el -from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy from spacy.vocab import Vocab @@ -30,10 +29,11 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 DOC_CHAR_CUTOFF = 300 -EPOCHS = 5 +EPOCHS = 10 DROPOUT = 0.1 -if __name__ == "__main__": + +def run_pipeline(): print("START", datetime.datetime.now()) print() nlp = spacy.load('en_core_web_lg') @@ -51,15 +51,11 @@ if __name__ == "__main__": # create training dataset create_wp_training = False + # train the EL pipe train_pipe = True - # run EL training - run_el_training = False - - # apply named entity linking to the dev dataset - apply_to_dev = False - - to_test_pipeline = False + # test the EL pipe on a simple example + to_test_pipeline = True # STEP 1 : create prior probabilities from WP # run only once ! @@ -119,10 +115,11 @@ if __name__ == "__main__": # STEP 6: create the entity linking pipe if train_pipe: - id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) - - train_limit = 10 + train_limit = 5 + dev_limit = 2 print("Training on", train_limit, "articles") + print("Dev testing on", dev_limit, "articles") + print() train_data = training_set_creator.read_training(nlp=nlp, training_dir=TRAINING_DIR, @@ -130,6 +127,12 @@ if __name__ == "__main__": limit=train_limit, to_print=False) + dev_data = training_set_creator.read_training(nlp=nlp, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit, + to_print=False) + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) nlp.add_pipe(el_pipe, last=True) @@ -137,12 +140,12 @@ if __name__ == "__main__": with nlp.disable_pipes(*other_pipes): # only train Entity Linking nlp.begin_training() - for itn in range(EPOCHS): - print() - print("EPOCH", itn) - random.shuffle(train_data) - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + for itn in range(EPOCHS): + random.shuffle(train_data) + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + + with nlp.disable_pipes(*other_pipes): for batch in batches: docs, golds = zip(*batch) nlp.update( @@ -151,20 +154,89 @@ if __name__ == "__main__": drop=DROPOUT, losses=losses, ) - print("Losses", losses) - # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?) 
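
The three accuracy columns reported above (1-1, 0-1, 1-0) come from toggling the pipe's context_weight and prior_weight and re-running the same accuracy measurement on the train and dev sets. A sketch of the same idea written as a loop over the weight combinations; evaluate_weight_combos is an invented helper, and measure_accuracy stands in for the _measure_accuracy function in this patch, wrapped so it only takes the data:

    def evaluate_weight_combos(el_pipe, measure_accuracy, datasets):
        # datasets could be {"train": train_data, "dev": dev_data}
        results = {}
        for context_weight, prior_weight in [(1, 1), (0, 1), (1, 0)]:
            el_pipe.context_weight = context_weight
            el_pipe.prior_weight = prior_weight
            key = "{}-{}".format(context_weight, prior_weight)
            results[key] = {name: measure_accuracy(data)
                            for name, data in datasets.items()}
        return results
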
- if apply_to_dev: - run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000) - print() + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, nlp) + train_acc_1_1 = _measure_accuracy(train_data, nlp) - # test KB + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, nlp) + train_acc_0_1 = _measure_accuracy(train_data, nlp) + + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, nlp) + train_acc_1_0 = _measure_accuracy(train_data, nlp) + + print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, losses['entity_linker'], + round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", + round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) + + # test Entity Linker if to_test_pipeline: - run_el.run_el_toy_example(kb=my_kb, nlp=nlp) print() - - # TODO coreference resolution - # add_coref() + run_el_toy_example(kb=my_kb, nlp=nlp) + print() print() print("STOP", datetime.datetime.now()) + + +def _measure_accuracy(data, nlp): + correct = 0 + incorrect = 0 + + texts = [d.text for d, g in data] + docs = list(nlp.pipe(texts)) + golds = [g for d, g in data] + + for doc, gold in zip(docs, golds): + correct_entries_per_article = dict() + for entity in gold.links: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to other types + pred_entity = ent.kb_id_ + start = ent.start + end = ent.end + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + if gold_entity is not None: + if gold_entity == pred_entity: + correct += 1 + else: + incorrect += 1 + + if correct == incorrect == 0: + return 0 + + acc = correct / (correct + incorrect) + return acc + + +def run_el_toy_example(nlp, kb): + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." + doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + print() + + # Q4426480 is her husband, Q3568763 her tutor + text = "Ada Lovelace loved her husband William King dearly. " \ + "Ada Lovelace was tutored by her favorite physics tutor William King." + doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +if __name__ == "__main__": + run_pipeline() diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index e5ed2ec23..9ef9df601 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1068,6 +1068,8 @@ class EntityLinker(Pipe): DOCS: TODO """ name = 'entity_linker' + context_weight = 1 + prior_weight = 1 @classmethod def Model(cls, **cfg): @@ -1093,14 +1095,15 @@ class EntityLinker(Pipe): self.doc_cutoff = self.cfg["doc_cutoff"] def use_avg_params(self): - """Modify the pipe's encoders/models, to use their average parameter values.""" - with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.sent_encoder.use_params(self.sgd_sent.averages) \ - and self.mention_encoder.use_params(self.sgd_mention.averages): - yield + # Modify the pipe's encoders/models, to use their average parameter values. 
+ # TODO: this doesn't work yet because there's no exit method + self.article_encoder.use_params(self.sgd_article.averages) + self.sent_encoder.use_params(self.sgd_sent.averages) + self.mention_encoder.use_params(self.sgd_mention.averages) + def require_model(self): - """Raise an error if the component's model is not initialized.""" + # Raise an error if the component's model is not initialized. if getattr(self, "mention_encoder", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) @@ -1110,6 +1113,7 @@ class EntityLinker(Pipe): self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() @@ -1229,27 +1233,27 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if candidates: - with self.use_avg_params: - scores = list() - for c in candidates: - prior_prob = c.prior_prob - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = cosine([entity_encoding], mention_enc_t) - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? - scores.append(score) + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids def set_annotations(self, docs, entities, kb_ids=None): for entity, kb_id in zip(entities, kb_ids): - entity.ent_kb_id_ = kb_id + for token in entity: + token.ent_kb_id_ = kb_id class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. 
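The change above to EntityLinker.predict scores each KB candidate as prior_prob + sim - prior_prob*sim, where sim is the cosine similarity between the candidate's entity vector and the mention encoding, and either term can be switched off through prior_weight / context_weight. The snippet below is a minimal, standalone sketch of just that scoring step; the candidate ids, prior probabilities and vectors are made-up values, and the cosine helper stands in for whatever similarity function the pipe actually imports:

    import numpy as np


    def cosine_sim(u, v):
        # plain cosine similarity between two 1-D vectors
        return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))


    def candidate_score(prior_prob, sim, prior_weight=1.0, context_weight=1.0):
        # same blend as in EntityLinker.predict: p + s - p*s, with optional weights
        p = prior_prob * prior_weight
        s = sim * context_weight
        return p + s - (p * s)


    # toy example: rank two hypothetical candidates for one mention encoding
    mention_enc = np.array([0.2, 0.9, 0.1])
    candidates = [
        ("Q42", 0.6, np.array([0.1, 0.8, 0.2])),  # (KB id, prior prob, entity vector) - illustrative only
        ("Q5", 0.1, np.array([0.9, 0.1, 0.0])),
    ]
    scores = [candidate_score(prior, cosine_sim(vec, mention_enc)) for _, prior, vec in candidates]
    best_id = candidates[scores.index(max(scores))][0]
    print(best_id, [round(s, 3) for s in scores])

Setting prior_weight=0 isolates the context model and context_weight=0 falls back to the corpus prior, which is exactly what the accuracy measurements in the pipeline script toggle later in this series; when both inputs lie in [0, 1], the blend behaves like a probabilistic OR, 1 - (1-p)(1-s).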
From 66813a1fdcfa2b1f2c9e3af0b8b3922427d1d73a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 11 Jun 2019 14:18:20 +0200 Subject: [PATCH 072/148] speed up predictions --- .../wiki_entity_linking/wiki_nel_pipeline.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 6e4ca6970..8753450bb 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -115,8 +115,8 @@ def run_pipeline(): # STEP 6: create the entity linking pipe if train_pipe: - train_limit = 5 - dev_limit = 2 + train_limit = 100 + dev_limit = 20 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -155,22 +155,25 @@ def run_pipeline(): losses=losses, ) + # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, nlp) - train_acc_1_1 = _measure_accuracy(train_data, nlp) + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + # print(" measuring accuracy 0-1") el_pipe.context_weight = 0 el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, nlp) - train_acc_0_1 = _measure_accuracy(train_data, nlp) + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + # print(" measuring accuracy 1-0") el_pipe.context_weight = 1 el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, nlp) - train_acc_1_0 = _measure_accuracy(train_data, nlp) + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, losses['entity_linker'], + print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, round(losses['entity_linker'], 2), round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) @@ -184,12 +187,13 @@ def run_pipeline(): print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, nlp): +def _measure_accuracy(data, el_pipe): correct = 0 incorrect = 0 - texts = [d.text for d, g in data] - docs = list(nlp.pipe(texts)) + docs = [d for d, g in data] + docs = el_pipe.pipe(docs) + golds = [g for d, g in data] for doc, gold in zip(docs, golds): From 6521cfa1328605b012db60d6077725bc697edd58 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Jun 2019 13:37:05 +0200 Subject: [PATCH 073/148] speeding up training --- .../wiki_entity_linking/wiki_nel_pipeline.py | 95 ++++++++++--------- spacy/pipeline/pipes.pyx | 5 + 2 files changed, 57 insertions(+), 43 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 8753450bb..90218edda 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -115,6 +115,7 @@ def run_pipeline(): # STEP 6: create the entity linking pipe if train_pipe: + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) train_limit = 100 dev_limit = 20 print("Training on", train_limit, "articles") @@ -131,7 +132,7 @@ def run_pipeline(): training_dir=TRAINING_DIR, dev=True, limit=dev_limit, - to_print=False) + to_print=False) el_pipe = 
nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) nlp.add_pipe(el_pipe, last=True) @@ -147,35 +148,40 @@ def run_pipeline(): with nlp.disable_pipes(*other_pipes): for batch in batches: - docs, golds = zip(*batch) - nlp.update( - docs, - golds, - drop=DROPOUT, - losses=losses, - ) + try: + docs, golds = zip(*batch) + nlp.update( + docs, + golds, + drop=DROPOUT, + losses=losses, + ) + except Exception as e: + print("Error updating batch", e) - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - # print(" measuring accuracy 0-1") - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - # print(" measuring accuracy 1-0") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + # print(" measuring accuracy 1-1") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, round(losses['entity_linker'], 2), - round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", - round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) + # print(" measuring accuracy 1-0") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + + print("train/dev acc, 1-1, 0-1, 1-0:" , + round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", + round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) # test Entity Linker if to_test_pipeline: @@ -193,26 +199,29 @@ def _measure_accuracy(data, el_pipe): docs = [d for d, g in data] docs = el_pipe.pipe(docs) - golds = [g for d, g in data] for doc, gold in zip(docs, golds): - correct_entries_per_article = dict() - for entity in gold.links: - start, end, gold_kb = entity - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + try: + correct_entries_per_article = dict() + for entity in gold.links: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to other types - pred_entity = ent.kb_id_ - start = ent.start - end = ent.end - gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) - if gold_entity is not None: - if gold_entity == pred_entity: - correct += 1 - else: - incorrect += 1 + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to other types + pred_entity = ent.kb_id_ + start = ent.start + end = ent.end + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + if gold_entity is not None: + if gold_entity == pred_entity: + correct += 1 + else: + incorrect += 1 + + except Exception as e: + print("Error assessing accuracy", e) if correct == incorrect == 0: 
return 0 @@ -243,4 +252,4 @@ def run_el_toy_example(nlp, kb): if __name__ == "__main__": - run_pipeline() + run_pipeline() \ No newline at end of file diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 9ef9df601..deaab0a19 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1220,8 +1220,13 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + + if isinstance(docs, Doc): + docs = [docs] + final_entities = list() final_kb_ids = list() + for i, article_doc in enumerate(docs): doc_encoding = self.article_encoder([article_doc]) for ent in article_doc.ents: From b12001f368017b7f19ddb5b4f2f670d8dbf8e57b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Jun 2019 22:05:53 +0200 Subject: [PATCH 074/148] small fixes --- .../wiki_entity_linking/wiki_nel_pipeline.py | 37 ++++++++++++------- spacy/pipeline/pipes.pyx | 2 +- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 90218edda..ebad16ba5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -116,8 +116,8 @@ def run_pipeline(): # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 100 - dev_limit = 20 + train_limit = 5000 + dev_limit = 1000 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -145,6 +145,7 @@ def run_pipeline(): random.shuffle(train_data) losses = {} batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + batchnr = 0 with nlp.disable_pipes(*other_pipes): for batch in batches: @@ -156,35 +157,43 @@ def run_pipeline(): drop=DROPOUT, losses=losses, ) + batchnr += 1 except Exception as e: print("Error updating batch", e) + losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print() + print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) + print() # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) - # print(" measuring accuracy 1-0") + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + + # using only context el_pipe.context_weight = 1 el_pipe.prior_weight = 0 dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc, 1-1, 0-1, 1-0:" , - round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", - round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) + print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + print() - # test Entity Linker if to_test_pipeline: + print() + print("STEP 8: applying Entity 
Linking to toy example", datetime.datetime.now()) print() run_el_toy_example(kb=my_kb, nlp=nlp) print() @@ -197,9 +206,9 @@ def _measure_accuracy(data, el_pipe): correct = 0 incorrect = 0 - docs = [d for d, g in data] + docs = [d for d, g in data if len(d) > 0] docs = el_pipe.pipe(docs) - golds = [g for d, g in data] + golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): try: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index deaab0a19..f9043f0e4 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1188,7 +1188,7 @@ class EntityLinker(Pipe): def get_loss(self, docs, golds, scores): targets = [[1] for _ in golds] # assuming we're only using positive examples loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets) - #loss = loss / len(golds) + loss = loss / len(golds) return loss, gradients def get_cossim_loss_2(self, yh, y, t): From 78dd3e11da60532dc6f4c5cbcd76fa7577d3cb33 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 13 Jun 2019 16:25:39 +0200 Subject: [PATCH 075/148] write entity linking pipe to file and keep vocab consistent between kb and nlp --- .../wiki_entity_linking/kb_creator.py | 4 +- .../wiki_entity_linking/wiki_nel_pipeline.py | 145 +++++++++------- spacy/kb.pyx | 6 + spacy/language.py | 9 + spacy/pipeline/pipes.pyx | 155 ++++++++++++++---- 5 files changed, 226 insertions(+), 93 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index d097ac449..785811ea6 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - # title_list = title_list[0:34200] - # title_to_id = {t: title_to_id[t] for t in title_list} + title_list = title_list[0:342] + title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ebad16ba5..0c03784a1 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -6,6 +6,7 @@ import random from spacy.util import minibatch, compounding from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.vocab import Vocab @@ -22,41 +23,48 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' -VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' +NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' +NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 DOC_CHAR_CUTOFF = 300 -EPOCHS = 10 +EPOCHS = 2 DROPOUT = 0.1 def run_pipeline(): print("START", datetime.datetime.now()) print() - nlp = spacy.load('en_core_web_lg') - my_kb = None + nlp_1 = spacy.load('en_core_web_lg') + nlp_2 = None + kb_1 = None + kb_2 = None # one-time methods to create KB and write to file to_create_prior_probs = 
False to_create_entity_counts = False - to_create_kb = False + to_create_kb = True # read KB back in from file to_read_kb = True - to_test_kb = False + to_test_kb = True # create training dataset create_wp_training = False # train the EL pipe train_pipe = True + measure_performance = False # test the EL pipe on a simple example to_test_pipeline = True + # write the NLP object, read back in and test again + test_nlp_io = True + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -75,7 +83,7 @@ def run_pipeline(): # run only once ! if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_kb = kb_creator.create_kb(nlp, + kb_1 = kb_creator.create_kb(nlp_1, max_entities_per_alias=MAX_CANDIDATES, min_occ=MIN_PAIR_OCC, entity_def_output=ENTITY_DEFS, @@ -83,63 +91,66 @@ def run_pipeline(): count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, to_print=False) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) + print("kb entities:", kb_1.get_size_entities()) + print("kb aliases:", kb_1.get_size_aliases()) print() - print("STEP 3b: write KB", datetime.datetime.now()) - my_kb.dump(KB_FILE) - nlp.vocab.to_disk(VOCAB_DIR) + print("STEP 3b: write KB and NLP", datetime.datetime.now()) + kb_1.dump(KB_FILE) + nlp_1.to_disk(NLP_1_DIR) print() # STEP 4 : read KB back in from file if to_read_kb: print("STEP 4: to_read_kb", datetime.datetime.now()) - my_vocab = Vocab() - my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) # TODO entity vectors - my_kb.load_bulk(KB_FILE) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) + # my_vocab = Vocab() + # my_vocab.from_disk(VOCAB_DIR) + # my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) + nlp_2 = spacy.load(NLP_1_DIR) + kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) + kb_2.load_bulk(KB_FILE) + print("kb entities:", kb_2.get_size_entities()) + print("kb aliases:", kb_2.get_size_aliases()) print() # test KB if to_test_kb: - run_el.run_kb_toy_example(kb=my_kb) + run_el.run_kb_toy_example(kb=kb_2) print() # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 5000 - dev_limit = 1000 + train_limit = 10 + dev_limit = 5 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() - train_data = training_set_creator.read_training(nlp=nlp, + train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, limit=train_limit, to_print=False) - dev_data = training_set_creator.read_training(nlp=nlp, + dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, to_print=False) - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) - nlp.add_pipe(el_pipe, last=True) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) - other_pipes = [pipe for pipe 
in nlp.pipe_names if pipe != "entity_linker"] - with nlp.disable_pipes(*other_pipes): # only train Entity Linking - nlp.begin_training() + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + nlp_2.begin_training() for itn in range(EPOCHS): random.shuffle(train_data) @@ -147,11 +158,11 @@ def run_pipeline(): batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) batchnr = 0 - with nlp.disable_pipes(*other_pipes): + with nlp_2.disable_pipes(*other_pipes): for batch in batches: try: docs, golds = zip(*batch) - nlp.update( + nlp_2.update( docs, golds, drop=DROPOUT, @@ -164,40 +175,62 @@ def run_pipeline(): losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - print() - print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) - print() + if measure_performance: + print() + print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) + print() - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) + # print(" measuring accuracy 1-1") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + # using only context + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) - print() + print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + print() if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) print() - run_el_toy_example(kb=my_kb, nlp=nlp) + run_el_toy_example(nlp=nlp_2) print() + if test_nlp_io: + print() + print("STEP 9: testing NLP IO", datetime.datetime.now()) + print() + print("writing to", NLP_2_DIR) + print(" vocab len nlp_2", len(nlp_2.vocab)) + print(" vocab len kb_2", len(kb_2.vocab)) + nlp_2.to_disk(NLP_2_DIR) + print() + print("reading from", NLP_2_DIR) + nlp_3 = spacy.load(NLP_2_DIR) + print(" vocab len nlp_3", len(nlp_3.vocab)) + + for pipe_name, pipe in nlp_3.pipeline: + if pipe_name == "entity_linker": + print(" vocab len kb_3", len(pipe.kb.vocab)) + + print() + print("running toy example with NLP 2") + 
run_el_toy_example(nlp=nlp_3) + print() print("STOP", datetime.datetime.now()) @@ -239,7 +272,7 @@ def _measure_accuracy(data, el_pipe): return acc -def run_el_toy_example(nlp, kb): +def run_el_toy_example(nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ "The main character in Doug's novel is the man Arthur Dent, " \ @@ -261,4 +294,4 @@ def run_el_toy_example(nlp, kb): if __name__ == "__main__": - run_pipeline() \ No newline at end of file + run_pipeline() diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ade2360be..9a84439ea 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -2,6 +2,8 @@ # cython: profile=True # coding: utf8 from collections import OrderedDict +from pathlib import Path, WindowsPath + from cpython.exc cimport PyErr_CheckSignals from spacy import util @@ -389,6 +391,8 @@ cdef class Writer: def __init__(self, object loc): if path.exists(loc): assert not path.isdir(loc), "%s is directory." % loc + if isinstance(loc, Path): + loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') assert self._fp != NULL @@ -431,6 +435,8 @@ cdef class Reader: def __init__(self, object loc): assert path.exists(loc) assert not path.isdir(loc) + if isinstance(loc, Path): + loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'rb') if not self._fp: diff --git a/spacy/language.py b/spacy/language.py index ec3232bd5..0e5e29244 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,6 +11,7 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly +from spacy.kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer @@ -809,6 +810,14 @@ class Language(object): # Convert to list here in case exclude is (default) tuple exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) + + # download the KB for the entity linking component - requires the vocab + for pipe_name, pipe in self.pipeline: + if pipe_name == "entity_linker": + kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=pipe.cfg["entity_width"]) + kb.load_bulk(path / pipe_name / "kb") + pipe.set_kb(kb) + self._path = path return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f9043f0e4..e73ff6a0e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,6 +14,7 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module +from spacy.kb import KnowledgeBase from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -1077,7 +1078,7 @@ class EntityLinker(Pipe): hidden_width = cfg.get("hidden_width", 32) article_width = cfg.get("article_width", 128) sent_width = cfg.get("sent_width", 64) - entity_width = cfg["kb"].entity_vector_length + entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) @@ -1089,34 +1090,41 @@ class EntityLinker(Pipe): return article_encoder, sent_encoder, mention_encoder def __init__(self, **cfg): + self.article_encoder = True + self.sent_encoder = True self.mention_encoder 
= True + self.kb = None self.cfg = dict(cfg) - self.kb = self.cfg["kb"] - self.doc_cutoff = self.cfg["doc_cutoff"] - - def use_avg_params(self): - # Modify the pipe's encoders/models, to use their average parameter values. - # TODO: this doesn't work yet because there's no exit method - self.article_encoder.use_params(self.sgd_article.averages) - self.sent_encoder.use_params(self.sgd_sent.averages) - self.mention_encoder.use_params(self.sgd_mention.averages) + self.doc_cutoff = self.cfg.get("doc_cutoff", 150) + def set_kb(self, kb): + self.kb = kb def require_model(self): # Raise an error if the component's model is not initialized. if getattr(self, "mention_encoder", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) + def require_kb(self): + # Raise an error if the knowledge base is not initialized. + if getattr(self, "kb", None) in (None, True, False): + # TODO: custom error + raise ValueError(Errors.E109.format(name=self.name)) + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + self.require_kb() + self.cfg["entity_width"] = self.kb.entity_vector_length + if self.mention_encoder is True: self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() + self.require_kb() if len(docs) != len(golds): raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), @@ -1220,6 +1228,7 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + self.require_kb() if isinstance(docs, Doc): docs = [docs] @@ -1228,30 +1237,32 @@ class EntityLinker(Pipe): final_kb_ids = list() for i, article_doc in enumerate(docs): - doc_encoding = self.article_encoder([article_doc]) - for ent in article_doc.ents: - sent_doc = ent.sent.as_doc() - sent_encoding = self.sent_encoder([sent_doc]) - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - mention_enc_t = np.transpose(mention_encoding) + if len(article_doc) > 0: + doc_encoding = self.article_encoder([article_doc]) + for ent in article_doc.ents: + sent_doc = ent.sent.as_doc() + if len(sent_doc) > 0: + sent_encoding = self.sent_encoder([sent_doc]) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + mention_enc_t = np.transpose(mention_encoding) - candidates = self.kb.get_candidates(ent.text) - if candidates: - scores = list() - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
- scores.append(score) + candidates = self.kb.get_candidates(ent.text) + if candidates: + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate.entity_) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids @@ -1260,6 +1271,80 @@ class EntityLinker(Pipe): for token in entity: token.ent_kb_id_ = kb_id + def to_bytes(self, exclude=tuple(), **kwargs): + """Serialize the pipe to a bytestring. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + """ + serialize = OrderedDict() + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["kb"] = self.kb.to_bytes # TODO + if self.mention_encoder not in (True, False, None): + serialize["article_encoder"] = self.article_encoder.to_bytes + serialize["sent_encoder"] = self.sent_encoder.to_bytes + serialize["mention_encoder"] = self.mention_encoder.to_bytes + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Load the pipe from a bytestring.""" + deserialize = OrderedDict() + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) # TODO + deserialize["article_encoder"] = lambda b: self.article_encoder.from_bytes(b) + deserialize["sent_encoder"] = lambda b: self.sent_encoder.from_bytes(b) + deserialize["mention_encoder"] = lambda b: self.mention_encoder.from_bytes(b) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + """Serialize the pipe to disk.""" + serialize = OrderedDict() + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["kb"] = lambda p: self.kb.dump(p) + if self.mention_encoder not in (None, True, False): + serialize["article_encoder"] = lambda p: p.open("wb").write(self.article_encoder.to_bytes()) + serialize["sent_encoder"] = lambda p: p.open("wb").write(self.sent_encoder.to_bytes()) + serialize["mention_encoder"] = lambda p: p.open("wb").write(self.mention_encoder.to_bytes()) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple(), **kwargs): + """Load the pipe from disk.""" + def load_article_encoder(p): + if self.article_encoder is True: + self.article_encoder, _, _ = self.Model(**self.cfg) + self.article_encoder.from_bytes(p.open("rb").read()) + + def load_sent_encoder(p): + if self.sent_encoder is True: + _, self.sent_encoder, _ = self.Model(**self.cfg) + self.sent_encoder.from_bytes(p.open("rb").read()) + + def load_mention_encoder(p): + if self.mention_encoder is True: + _, _, self.mention_encoder = self.Model(**self.cfg) + self.mention_encoder.from_bytes(p.open("rb").read()) + + deserialize = 
OrderedDict() + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["article_encoder"] = load_article_encoder + deserialize["sent_encoder"] = load_sent_encoder + deserialize["mention_encoder"] = load_mention_encoder + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_disk(path, deserialize, exclude) + return self + + def rehearse(self, docs, sgd=None, losses=None, **config): + # TODO + pass + + def add_label(self, label): + pass + + class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. From 0b04d142de01806e15a696fcc667c8563d438005 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 13 Jun 2019 22:32:56 +0200 Subject: [PATCH 076/148] regenerating KB --- .../wiki_entity_linking/kb_creator.py | 19 +++++++++---------- .../wiki_entity_linking/train_descriptions.py | 4 ++-- .../wiki_entity_linking/wiki_nel_pipeline.py | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 785811ea6..7b740216b 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -14,6 +14,7 @@ from . import wikidata_processor as wd INPUT_DIM = 300 # dimension of pre-trained vectors DESC_WIDTH = 64 + def create_kb(nlp, max_entities_per_alias, min_occ, entity_def_output, entity_descr_output, count_input, prior_prob_input, to_print=False): @@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ, if read_raw_data: print() - print("1. _read_wikidata_entities", datetime.datetime.now()) - print() + print(" * _read_wikidata_entities", datetime.datetime.now()) title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) # write the title-ID and ID-description mappings to file @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - title_list = title_list[0:342] - title_to_id = {t: title_to_id[t] for t in title_list} + # title_list = title_list[0:342] + # title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] @@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ, description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] print() - print("2. _get_entity_frequencies", datetime.datetime.now()) + print(" * _get_entity_frequencies", datetime.datetime.now()) print() entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) print() - print("3. train entity encoder", datetime.datetime.now()) + print(" * train entity encoder", datetime.datetime.now()) print() encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) encoder.train(description_list=description_list, to_print=True) print() - print("4. get entity embeddings", datetime.datetime.now()) + print(" * get entity embeddings", datetime.datetime.now()) print() embeddings = encoder.apply_encoder(description_list) print() - print("5. adding", len(entity_list), "entities", datetime.datetime.now()) - print() + print(" * adding", len(entity_list), "entities", datetime.datetime.now()) kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) print() - print("6. 
adding aliases", datetime.datetime.now()) + print(" * adding aliases", datetime.datetime.now()) print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index e1a2f1797..92859fd84 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -17,7 +17,7 @@ class EntityEncoder: DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.1 + STOP_THRESHOLD = 0.04 BATCH_SIZE = 1000 @@ -127,7 +127,7 @@ class EntityEncoder: return loss, gradients def _test_encoder(self): - """ Test encoder on some dummy examples """ + # Test encoder on some dummy examples desc_A1 = "Fictional character in The Simpsons" desc_A2 = "Simpsons - fictional human" desc_A3 = "Fictional character in The Flintstones" diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 0c03784a1..d5002e26f 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -22,7 +22,7 @@ ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' -KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' +KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb' NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' @@ -56,14 +56,14 @@ def run_pipeline(): create_wp_training = False # train the EL pipe - train_pipe = True + train_pipe = False measure_performance = False # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again - test_nlp_io = True + test_nlp_io = False # STEP 1 : create prior probabilities from WP # run only once ! 
From b312f2d0e79b886d0d824f9294ccc2f1f24b725a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 14 Jun 2019 15:55:26 +0200 Subject: [PATCH 077/148] redo training data to be independent of KB and entity-level instead of doc-level --- .../wiki_entity_linking/train_descriptions.py | 2 - .../training_set_creator.py | 219 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 44 ++-- spacy/pipeline/pipes.pyx | 96 ++++---- 4 files changed, 179 insertions(+), 182 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 92859fd84..bf4bcbc3d 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -1,8 +1,6 @@ # coding: utf-8 from random import shuffle -from examples.pipeline.wiki_entity_linking import kb_creator - import numpy as np from spacy._ml import zero_init, create_default_optimizer diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 38a86058d..fc620a1d3 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -19,17 +19,15 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm ENTITY_FILE = "gold_entities.csv" -def create_training(kb, entity_def_input, training_output): - if not kb: - raise ValueError("kb should be defined") +def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) - _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset + _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) # TODO: full dataset 100000000 -def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): +def _process_wikipedia_texts(wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: - raw text data + positive and negative instances + raw text data + positive instances """ title_regex = re.compile(r'(?<=).*(?=)') @@ -43,8 +41,9 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): _write_training_entity(outputfile=entityfile, article_id="article_id", alias="alias", - entity="entity", - correct="correct") + entity="WD_id", + start="start", + end="end") with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: line = file.readline() @@ -75,14 +74,11 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): elif clean_line == "": if article_id: try: - _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text.strip(), training_output) - # on a previous run, an error occurred after 46M lines and 2h + _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), training_output) except Exception as e: print("Error processing article", article_id, article_title, e) else: - print("Done processing a page, but couldn't find an article_id ?") - print(article_title) - print(article_text) + print("Done processing a page, but couldn't find an article_id ?", article_title) article_text = "" article_title = None article_id = None @@ -122,7 +118,14 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): text_regex = re.compile(r'(?<=).*(?=", entity) - candidates = kb.get_candidates(alias) + if open_read > 2: + reading_special_case = True - # as training data, we only store 
entities that are sufficiently ambiguous - if len(candidates) > 1: - _write_training_article(article_id=article_id, clean_text=clean_text, training_output=training_output) - # print("alias", alias) + if open_read == 2 and reading_text: + reading_text = False + reading_entity = True + reading_mention = False - # print all incorrect candidates - for c in candidates: - if entity != c.entity_: + # we just finished reading an entity + if open_read == 0 and not reading_text: + if '#' in entity_buffer or entity_buffer.startswith(':'): + reading_special_case = True + # Ignore cases with nested structures like File: handles etc + if not reading_special_case: + if not mention_buffer: + mention_buffer = entity_buffer + start = len(final_text) + end = start + len(mention_buffer) + qid = wp_to_id.get(entity_buffer, None) + if qid: _write_training_entity(outputfile=entityfile, article_id=article_id, - alias=alias, - entity=c.entity_, - correct="0") + alias=mention_buffer, + entity=qid, + start=start, + end=end) + found_entities = True + final_text += mention_buffer - # print the one correct candidate - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=alias, - entity=entity, - correct="1") + entity_buffer = "" + mention_buffer = "" - # print("gold entity", entity) - # print() + reading_text = True + reading_entity = False + reading_mention = False + reading_special_case = False - # _run_ner_depr(nlp, clean_text, article_dict) - # print() + if found_entities: + _write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output) info_regex = re.compile(r'{[^{]*?}') -interwiki_regex = re.compile(r'\[\[([^|]*?)]]') -interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') -htlm_regex = re.compile(r'<!--[^!]*-->') +htlm_regex = re.compile(r'<!--[^-]*-->') category_regex = re.compile(r'\[\[Category:[^\[]*]]') file_regex = re.compile(r'\[\[File:[^[\]]+]]') ref_regex = re.compile(r'<ref.*?>') # non-greedy @@ -215,12 +242,6 @@ def _get_clean_wp_text(article_text): try_again = False previous_length = len(clean_text) - # remove simple interwiki links (no alternative name) - clean_text = interwiki_regex.sub(r'\1', clean_text) - - # remove simple interwiki links by picking the alternative name - clean_text = interwiki_2_regex.sub(r'\1', clean_text) - # remove HTML comments clean_text = htlm_regex.sub('', clean_text) @@ -265,43 +286,34 @@ def _write_training_article(article_id, clean_text, training_output): outputfile.write(clean_text) -def _write_training_entity(outputfile, article_id, alias, entity, correct): - outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") +def _write_training_entity(outputfile, article_id, alias, entity, start, end): + outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n") -def read_training_entities(training_output, collect_correct=True, collect_incorrect=False): +def read_training_entities(training_output): entityfile_loc = training_output + "/" + ENTITY_FILE - incorrect_entries_per_article = dict() - correct_entries_per_article = dict() + entries_per_article = dict() + with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: fields = line.replace('\n', "").split(sep='|') article_id = fields[0] alias = fields[1] - entity = fields[2] - correct = fields[3] + wp_title = fields[2] + start = fields[3] + end = fields[4] - if correct == "1" and collect_correct: - entry_dict = correct_entries_per_article.get(article_id, dict()) - if 
alias in entry_dict: - raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE) - entry_dict[alias] = entity - correct_entries_per_article[article_id] = entry_dict + entries_by_offset = entries_per_article.get(article_id, dict()) + entries_by_offset[start + "-" + end] = (alias, wp_title) - if correct == "0" and collect_incorrect: - entry_dict = incorrect_entries_per_article.get(article_id, dict()) - entities = entry_dict.get(alias, set()) - entities.add(entity) - entry_dict[alias] = entities - incorrect_entries_per_article[article_id] = entry_dict + entries_per_article[article_id] = entries_by_offset - return correct_entries_per_article, incorrect_entries_per_article + return entries_per_article def read_training(nlp, training_dir, dev, limit, to_print): - correct_entries, incorrect_entries = read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=True) + # This method will provide training examples that correspond to the entity annotations found by the nlp object + entries_per_article = read_training_entities(training_output=training_dir) data = [] @@ -320,36 +332,33 @@ def read_training(nlp, training_dir, dev, limit, to_print): text = file.read() article_doc = nlp(text) + entries_by_offset = entries_per_article.get(article_id, dict()) + gold_entities = list() + for ent in article_doc.ents: + start = ent.start_char + end = ent.end_char - # process all positive and negative entities, collect all relevant mentions in this article - for mention, entity_pos in correct_entries[article_id].items(): - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(nlp.vocab) - patterns = list(nlp.tokenizer.pipe([mention])) + entity_tuple = entries_by_offset.get(str(start) + "-" + str(end), None) + if entity_tuple: + alias, wp_title = entity_tuple + if ent.text != alias: + print("Non-matching entity in", article_id, start, end) + else: + gold_entities.append((start, end, wp_title)) - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) - - # store gold entities - for match_id, start, end in matches: - gold_entities.append((start, end, entity_pos)) - - gold = GoldParse(doc=article_doc, links=gold_entities) - data.append((article_doc, gold)) + if gold_entities: + gold = GoldParse(doc=article_doc, links=gold_entities) + data.append((article_doc, gold)) cnt += 1 except Exception as e: print("Problem parsing article", article_id) print(e) + raise e if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() return data - - - - diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index d5002e26f..faea93f53 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -30,8 +30,8 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 -DOC_CHAR_CUTOFF = 300 -EPOCHS = 2 +DOC_SENT_CUTOFF = 2 +EPOCHS = 10 DROPOUT = 0.1 @@ -46,14 +46,14 @@ def run_pipeline(): # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False # TODO: entity_defs should also contain entities not in the KB # read KB back in from file - to_read_kb = True - to_test_kb = True + to_read_kb = False + to_test_kb = False 
# create training dataset - create_wp_training = False + create_wp_training = True # train the EL pipe train_pipe = False @@ -103,9 +103,6 @@ def run_pipeline(): # STEP 4 : read KB back in from file if to_read_kb: print("STEP 4: to_read_kb", datetime.datetime.now()) - # my_vocab = Vocab() - # my_vocab.from_disk(VOCAB_DIR) - # my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) nlp_2 = spacy.load(NLP_1_DIR) kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) kb_2.load_bulk(KB_FILE) @@ -121,13 +118,13 @@ def run_pipeline(): # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 10 - dev_limit = 5 + train_limit = 50 + dev_limit = 10 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -144,7 +141,7 @@ def run_pipeline(): limit=dev_limit, to_print=False) - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF}) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -199,10 +196,14 @@ def run_pipeline(): el_pipe.prior_weight = 0 dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) print() + # reset for follow-up tests + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + + if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) @@ -215,17 +216,10 @@ def run_pipeline(): print("STEP 9: testing NLP IO", datetime.datetime.now()) print() print("writing to", NLP_2_DIR) - print(" vocab len nlp_2", len(nlp_2.vocab)) - print(" vocab len kb_2", len(kb_2.vocab)) nlp_2.to_disk(NLP_2_DIR) print() print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) - print(" vocab len nlp_3", len(nlp_3.vocab)) - - for pipe_name, pipe in nlp_3.pipeline: - if pipe_name == "entity_linker": - print(" vocab len kb_3", len(pipe.kb.vocab)) print() print("running toy example with NLP 2") @@ -253,9 +247,10 @@ def _measure_accuracy(data, el_pipe): for ent in doc.ents: if ent.label_ == "PERSON": # TODO: expand to other types pred_entity = ent.kb_id_ - start = ent.start - end = ent.end + start = ent.start_char + end = ent.end_char gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: if gold_entity == pred_entity: correct += 1 @@ -285,7 +280,8 @@ def run_el_toy_example(nlp): print() # Q4426480 is her husband, Q3568763 her tutor - text = "Ada Lovelace loved her husband William King dearly. " \ + text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine."\ + "Ada Lovelace loved her husband William King dearly. " \ "Ada Lovelace was tutored by her favorite physics tutor William King." 
doc = nlp(text) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index e73ff6a0e..5d82da7ee 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1074,6 +1074,9 @@ class EntityLinker(Pipe): @classmethod def Model(cls, **cfg): + if "entity_width" not in cfg: + raise ValueError("entity_width not found") + embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 32) article_width = cfg.get("article_width", 128) @@ -1095,7 +1098,10 @@ class EntityLinker(Pipe): self.mention_encoder = True self.kb = None self.cfg = dict(cfg) - self.doc_cutoff = self.cfg.get("doc_cutoff", 150) + self.doc_cutoff = self.cfg.get("doc_cutoff", 5) + self.sgd_article = None + self.sgd_sent = None + self.sgd_mention = None def set_kb(self, kb): self.kb = kb @@ -1126,6 +1132,12 @@ class EntityLinker(Pipe): self.require_model() self.require_kb() + if losses is not None: + losses.setdefault(self.name, 0.0) + + if not docs or not golds: + return 0 + if len(docs) != len(golds): raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), n_golds=len(golds))) @@ -1141,21 +1153,30 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity - mention = doc[start:end] - sentence = mention.sent - first_par = doc[0:self.doc_cutoff].as_doc() + mention = doc.text[start:end] + sent_start = 0 + sent_end = len(doc) + first_par_end = len(doc) + for index, sent in enumerate(doc.sents): + if start >= sent.start_char and end <= sent.end_char: + sent_start = sent.start + sent_end = sent.end + if index == self.doc_cutoff-1: + first_par_end = sent.end + sentence = doc[sent_start:sent_end].as_doc() + first_par = doc[0:first_par_end].as_doc() - candidates = self.kb.get_candidates(mention.text) + candidates = self.kb.get_candidates(mention) for c in candidates: kb_id = c.entity_ - # TODO: currently only training on the positive instances + # Currently only training on the positive instances if kb_id == gold_kb: prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) article_docs.append(first_par) - sentence_docs.append(sentence.as_doc()) + sentence_docs.append(sentence) if len(entity_encodings) > 0: doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) @@ -1168,11 +1189,6 @@ class EntityLinker(Pipe): entity_encodings = np.asarray(entity_encodings, dtype=np.float32) loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) - # print("scores", mention_encodings) - # print("golds", entity_encodings) - # print("loss", loss) - # print("d_scores", d_scores) - mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) # gradient : concat (doc+sent) vs. 
desc @@ -1187,7 +1203,6 @@ class EntityLinker(Pipe): bp_sent(sent_gradients, sgd=self.sgd_sent) if losses is not None: - losses.setdefault(self.name, 0.0) losses[self.name] += loss return loss @@ -1230,16 +1245,25 @@ class EntityLinker(Pipe): self.require_model() self.require_kb() - if isinstance(docs, Doc): - docs = [docs] - final_entities = list() final_kb_ids = list() - for i, article_doc in enumerate(docs): - if len(article_doc) > 0: - doc_encoding = self.article_encoder([article_doc]) - for ent in article_doc.ents: + if not docs: + return final_entities, final_kb_ids + + if isinstance(docs, Doc): + docs = [docs] + + for i, doc in enumerate(docs): + if len(doc) > 0: + first_par_end = len(doc) + for index, sent in enumerate(doc.sents): + if index == self.doc_cutoff-1: + first_par_end = sent.end + first_par = doc[0:first_par_end].as_doc() + + doc_encoding = self.article_encoder([first_par]) + for ent in doc.ents: sent_doc = ent.sent.as_doc() if len(sent_doc) > 0: sent_encoding = self.sent_encoder([sent_doc]) @@ -1254,7 +1278,7 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob * self.prior_weight kb_id = c.entity_ entity_encoding = c.entity_vector - sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + sim = float(cosine(np.asarray([entity_encoding]), mention_enc_t)) * self.context_weight score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? scores.append(score) @@ -1271,36 +1295,7 @@ class EntityLinker(Pipe): for token in entity: token.ent_kb_id_ = kb_id - def to_bytes(self, exclude=tuple(), **kwargs): - """Serialize the pipe to a bytestring. - - exclude (list): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - """ - serialize = OrderedDict() - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["kb"] = self.kb.to_bytes # TODO - if self.mention_encoder not in (True, False, None): - serialize["article_encoder"] = self.article_encoder.to_bytes - serialize["sent_encoder"] = self.sent_encoder.to_bytes - serialize["mention_encoder"] = self.mention_encoder.to_bytes - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): - """Load the pipe from a bytestring.""" - deserialize = OrderedDict() - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["kb"] = lambda b: self.kb.from_bytes(b) # TODO - deserialize["article_encoder"] = lambda b: self.article_encoder.from_bytes(b) - deserialize["sent_encoder"] = lambda b: self.sent_encoder.from_bytes(b) - deserialize["mention_encoder"] = lambda b: self.mention_encoder.from_bytes(b) - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) - util.from_bytes(bytes_data, deserialize, exclude) - return self - def to_disk(self, path, exclude=tuple(), **kwargs): - """Serialize the pipe to disk.""" serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.dump(p) @@ -1312,7 +1307,6 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): - """Load the pipe from disk.""" def load_article_encoder(p): if self.article_encoder is True: self.article_encoder, _, _ = self.Model(**self.cfg) From 81731907ba0c3589c28367c5ec08f8a8f3eaeeae Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 14 Jun 2019 19:55:46 +0200 Subject: [PATCH 078/148] performance per entity 
type --- .../wiki_entity_linking/kb_creator.py | 37 +++--- .../training_set_creator.py | 2 +- .../wiki_entity_linking/wiki_nel_pipeline.py | 122 +++++++++++------- .../wiki_entity_linking/wikidata_processor.py | 28 ++-- .../wikipedia_processor.py | 5 +- 5 files changed, 114 insertions(+), 80 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 7b740216b..4d7bd646b 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -15,10 +15,10 @@ INPUT_DIM = 300 # dimension of pre-trained vectors DESC_WIDTH = 64 -def create_kb(nlp, max_entities_per_alias, min_occ, +def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, entity_def_output, entity_descr_output, count_input, prior_prob_input, to_print=False): - """ Create the knowledge base from Wikidata entries """ + # Create the knowledge base from Wikidata entries kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) # disable this part of the pipeline when rerunning the KB generation from preprocessed files @@ -37,21 +37,26 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_to_id = _get_entity_to_id(entity_def_output) id_to_descr = _get_id_to_description(entity_descr_output) - title_list = list(title_to_id.keys()) - - # TODO: remove this filter (just for quicker testing of code) - # title_list = title_list[0:342] - # title_to_id = {t: title_to_id[t] for t in title_list} - - entity_list = [title_to_id[x] for x in title_list] - - # Currently keeping entities from the KB where there is no description - putting a default void description - description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] - print() print(" * _get_entity_frequencies", datetime.datetime.now()) print() - entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) + entity_frequencies = wp.get_all_frequencies(count_input=count_input) + + # filter the entities for in the KB by frequency, because there's just too much data otherwise + filtered_title_to_id = dict() + entity_list = list() + description_list = list() + frequency_list = list() + for title, entity in title_to_id.items(): + freq = entity_frequencies.get(title, 0) + desc = id_to_descr.get(entity, None) + if desc and freq > min_entity_freq: + entity_list.append(entity) + description_list.append(desc) + frequency_list.append(freq) + filtered_title_to_id[title] = entity + + print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles") print() print(" * train entity encoder", datetime.datetime.now()) @@ -67,12 +72,12 @@ def create_kb(nlp, max_entities_per_alias, min_occ, print() print(" * adding", len(entity_list), "entities", datetime.datetime.now()) - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) + kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings) print() print(" * adding aliases", datetime.datetime.now()) print() - _add_aliases(kb, title_to_id=title_to_id, + _add_aliases(kb, title_to_id=filtered_title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, prior_prob_input=prior_prob_input) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index fc620a1d3..845ce62dc 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ 
b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -21,7 +21,7 @@ ENTITY_FILE = "gold_entities.csv" def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) # TODO: full dataset 100000000 + _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) def _process_wikipedia_texts(wp_to_id, training_output, limit=None): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index faea93f53..1e5280f89 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -29,6 +29,7 @@ NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 +MIN_ENTITY_FREQ = 200 MIN_PAIR_OCC = 5 DOC_SENT_CUTOFF = 2 EPOCHS = 10 @@ -46,14 +47,14 @@ def run_pipeline(): # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = False # TODO: entity_defs should also contain entities not in the KB + to_create_kb = True # read KB back in from file to_read_kb = False to_test_kb = False # create training dataset - create_wp_training = True + create_wp_training = False # train the EL pipe train_pipe = False @@ -84,13 +85,14 @@ def run_pipeline(): if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) kb_1 = kb_creator.create_kb(nlp_1, - max_entities_per_alias=MAX_CANDIDATES, - min_occ=MIN_PAIR_OCC, - entity_def_output=ENTITY_DEFS, - entity_descr_output=ENTITY_DESCR, - count_input=ENTITY_COUNTS, - prior_prob_input=PRIOR_PROB, - to_print=False) + max_entities_per_alias=MAX_CANDIDATES, + min_entity_freq=MIN_ENTITY_FREQ, + min_occ=MIN_PAIR_OCC, + entity_def_output=ENTITY_DEFS, + entity_descr_output=ENTITY_DESCR, + count_input=ENTITY_COUNTS, + prior_prob_input=PRIOR_PROB, + to_print=False) print("kb entities:", kb_1.get_size_entities()) print("kb aliases:", kb_1.get_size_aliases()) print() @@ -112,7 +114,7 @@ def run_pipeline(): # test KB if to_test_kb: - run_el.run_kb_toy_example(kb=kb_2) + test_kb(kb_2) print() # STEP 5: create a training dataset from WP @@ -121,10 +123,18 @@ def run_pipeline(): training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) + + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + nlp_2.begin_training() + if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 50 - dev_limit = 10 + train_limit = 10 + dev_limit = 2 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -141,14 +151,6 @@ def run_pipeline(): limit=dev_limit, to_print=False) - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) - el_pipe.set_kb(kb_2) - nlp_2.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] - with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - nlp_2.begin_training() - for itn in range(EPOCHS): 
random.shuffle(train_data) losses = {} @@ -180,30 +182,32 @@ def run_pipeline(): # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) + dev_acc_1_1, dev_acc_1_1_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc combo:", round(dev_acc_1_1, 3), [(x, round(y, 3)) for x, y in dev_acc_1_1_dict.items()]) + train_acc_1_1, train_acc_1_1_dict = _measure_accuracy(train_data, el_pipe) + print("train acc combo:", round(train_acc_1_1, 3), [(x, round(y, 3)) for x, y in train_acc_1_1_dict.items()]) # baseline using only prior probabilities el_pipe.context_weight = 0 el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + dev_acc_0_1, dev_acc_0_1_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc prior:", round(dev_acc_0_1, 3), [(x, round(y, 3)) for x, y in dev_acc_0_1_dict.items()]) + train_acc_0_1, train_acc_0_1_dict = _measure_accuracy(train_data, el_pipe) + print("train acc prior:", round(train_acc_0_1, 3), [(x, round(y, 3)) for x, y in train_acc_0_1_dict.items()]) # using only context el_pipe.context_weight = 1 el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + dev_acc_1_0, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context:", round(dev_acc_1_0, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) + train_acc_1_0, train_acc_1_0_dict = _measure_accuracy(train_data, el_pipe) + print("train acc context:", round(train_acc_1_0, 3), [(x, round(y, 3)) for x, y in train_acc_1_0_dict.items()]) print() # reset for follow-up tests el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) @@ -230,8 +234,8 @@ def run_pipeline(): def _measure_accuracy(data, el_pipe): - correct = 0 - incorrect = 0 + correct_by_label = dict() + incorrect_by_label = dict() docs = [d for d, g in data if len(d) > 0] docs = el_pipe.pipe(docs) @@ -245,31 +249,53 @@ def _measure_accuracy(data, el_pipe): correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to other types - pred_entity = ent.kb_id_ - start = ent.start_char - end = ent.end_char - gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - if gold_entity == pred_entity: - correct += 1 - else: - incorrect += 1 + ent_label = ent.label_ + pred_entity = ent.kb_id_ + start = ent.start_char + end = ent.end_char + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' + if gold_entity is not None: + if gold_entity == pred_entity: + correct = correct_by_label.get(ent_label, 0) + correct_by_label[ent_label] = correct + 1 + else: + incorrect = incorrect_by_label.get(ent_label, 0) + incorrect_by_label[ent_label] = incorrect + 1 except 
Exception as e: print("Error assessing accuracy", e) - if correct == incorrect == 0: - return 0 + acc_by_label = dict() + total_correct = 0 + total_incorrect = 0 + for label, correct in correct_by_label.items(): + incorrect = incorrect_by_label.get(label, 0) + total_correct += correct + total_incorrect += incorrect + if correct == incorrect == 0: + acc_by_label[label] = 0 + else: + acc_by_label[label] = correct / (correct + incorrect) + acc = 0 + if not (total_correct == total_incorrect == 0): + acc = total_correct / (total_correct + total_incorrect) + return acc, acc_by_label - acc = correct / (correct + incorrect) - return acc + +def test_kb(kb): + for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"): + candidates = kb.get_candidates(mention) + + print("generating candidates for " + mention + " :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() def run_el_toy_example(nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ + "Douglas reminds us to always bring our towel, even in China or Brazil. " \ "The main character in Doug's novel is the man Arthur Dent, " \ "but Douglas doesn't write about George Washington or Homer Simpson." doc = nlp(text) diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py index 7d84b1a2a..f6a6cbe23 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re import bz2 import json import datetime @@ -14,7 +13,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" lang = 'en' - prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected + # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' title_to_id = dict() @@ -41,18 +40,19 @@ def read_wikidata_entities_json(limit=None, to_print=False): entry_type = obj["type"] if entry_type == "item": - # filtering records on their properties - keep = False + # filtering records on their properties (currently disabled to get ALL data) + # keep = False + keep = True claims = obj["claims"] - for prop, value_set in prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - cp_rank = cp['rank'] - if cp_rank != "deprecated" and cp_id in value_set: - keep = True + # for prop, value_set in prop_filter.items(): + # claim_property = claims.get(prop, None) + # if claim_property: + # for cp in claim_property: + # cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + # cp_rank = cp['rank'] + # if cp_rank != "deprecated" and cp_id in value_set: + # keep = True if keep: unique_id = obj["id"] @@ -70,6 +70,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print("prop:", prop, cp_values) + found_link = False if parse_sitelinks: site_value = obj["sitelinks"].get(site_filter, None) if site_value: @@ -77,6 +78,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id + found_link = True if parse_labels: labels = obj["labels"] @@ -86,7 +88,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print("label (" + lang + "):", lang_label["value"]) - if parse_descriptions: + if found_link and parse_descriptions: descriptions = obj["descriptions"] if descriptions: lang_descr = descriptions.get(lang, None) diff --git a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py index 0461cb19f..e53423487 100644 --- a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py @@ -175,7 +175,7 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): print("Total count:", total_count) -def get_entity_frequencies(count_input, entities): +def get_all_frequencies(count_input): entity_to_count = dict() with open(count_input, 'r', encoding='utf8') as csvfile: csvreader = csv.reader(csvfile, delimiter='|') @@ -184,4 +184,5 @@ def get_entity_frequencies(count_input, entities): for row in csvreader: entity_to_count[row[0]] = int(row[1]) - return [entity_to_count.get(e, 0) for e in entities] + return entity_to_count + From 24db1392b9fad37fc532bf53d7f152611f319e70 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 16 Jun 2019 21:14:45 +0200 Subject: [PATCH 079/148] reprocessing all of wikipedia for training data --- .../wiki_entity_linking/kb_creator.py | 2 +- .../pipeline/wiki_entity_linking/run_el.py | 4 +- .../pipeline/wiki_entity_linking/train_el.py | 4 +- .../training_set_creator.py | 108 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 78 +++++++------ .../wiki_entity_linking/wikidata_processor.py | 2 +- 6 files changed, 98 insertions(+), 100 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py 
index 4d7bd646b..80d0e21e9 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -56,7 +56,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, frequency_list.append(freq) filtered_title_to_id[title] = entity - print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles") + print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles with filter frequency", min_entity_freq) print() print(" * train entity encoder", datetime.datetime.now()) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 52ccccfda..c26e8d65a 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -25,9 +25,7 @@ def run_kb_toy_example(kb): def run_el_dev(nlp, kb, training_dir, limit=None): - correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=False) + correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir) predictions = list() golds = list() diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 143e38d99..a4026d935 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -389,9 +389,7 @@ class EL_Model: bp_sent(sent_gradients, sgd=self.sgd_sent) def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print): - correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=True) + correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir) entities_by_cluster = dict() gold_by_entity = dict() diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 845ce62dc..5d089c620 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -16,12 +16,13 @@ from . 
import wikipedia_processor as wp, kb_creator Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ -ENTITY_FILE = "gold_entities.csv" +# ENTITY_FILE = "gold_entities.csv" +ENTITY_FILE = "gold_entities_100000.csv" # use this file for faster processing def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) + _process_wikipedia_texts(wp_to_id, training_output, limit=None) def _process_wikipedia_texts(wp_to_id, training_output, limit=None): @@ -290,75 +291,72 @@ def _write_training_entity(outputfile, article_id, alias, entity, start, end): outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n") -def read_training_entities(training_output): +def is_dev(article_id): + return article_id.endswith("3") + + +def read_training_entities(training_output, dev, limit): entityfile_loc = training_output + "/" + ENTITY_FILE entries_per_article = dict() + article_ids = set() with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: - fields = line.replace('\n', "").split(sep='|') - article_id = fields[0] - alias = fields[1] - wp_title = fields[2] - start = fields[3] - end = fields[4] + if not limit or len(article_ids) < limit: + fields = line.replace('\n', "").split(sep='|') + article_id = fields[0] + if dev == is_dev(article_id) and article_id != "article_id": + article_ids.add(article_id) - entries_by_offset = entries_per_article.get(article_id, dict()) - entries_by_offset[start + "-" + end] = (alias, wp_title) + alias = fields[1] + wp_title = fields[2] + start = fields[3] + end = fields[4] - entries_per_article[article_id] = entries_by_offset + entries_by_offset = entries_per_article.get(article_id, dict()) + entries_by_offset[start + "-" + end] = (alias, wp_title) + + entries_per_article[article_id] = entries_by_offset return entries_per_article -def read_training(nlp, training_dir, dev, limit, to_print): - # This method will provide training examples that correspond to the entity annotations found by the nlp object - entries_per_article = read_training_entities(training_output=training_dir) +def read_training(nlp, training_dir, dev, limit): + # This method provides training examples that correspond to the entity annotations found by the nlp object + + print("reading training entities") + entries_per_article = read_training_entities(training_output=training_dir, dev=dev, limit=limit) + print("done reading training entities") data = [] + for article_id, entries_by_offset in entries_per_article.items(): + file_name = article_id + ".txt" + try: + # parse the article text + with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = nlp(text) - cnt = 0 - files = listdir(training_dir) - for f in files: - if not limit or cnt < limit: - if dev == run_el.is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0 and to_print: - print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + gold_entities = list() + for ent in article_doc.ents: + start = ent.start_char + end = ent.end_char - try: - # parse the article text - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - article_doc = nlp(text) + entity_tuple = entries_by_offset.get(str(start) + "-" + str(end), None) + if entity_tuple: + alias, wp_title = entity_tuple + if ent.text != alias: + 
print("Non-matching entity in", article_id, start, end) + else: + gold_entities.append((start, end, wp_title)) - entries_by_offset = entries_per_article.get(article_id, dict()) + if gold_entities: + gold = GoldParse(doc=article_doc, links=gold_entities) + data.append((article_doc, gold)) - gold_entities = list() - for ent in article_doc.ents: - start = ent.start_char - end = ent.end_char + except Exception as e: + print("Problem parsing article", article_id) + print(e) + raise e - entity_tuple = entries_by_offset.get(str(start) + "-" + str(end), None) - if entity_tuple: - alias, wp_title = entity_tuple - if ent.text != alias: - print("Non-matching entity in", article_id, start, end) - else: - gold_entities.append((start, end, wp_title)) - - if gold_entities: - gold = GoldParse(doc=article_doc, links=gold_entities) - data.append((article_doc, gold)) - - cnt += 1 - except Exception as e: - print("Problem parsing article", article_id) - print(e) - raise e - - if to_print: - print() - print("Processed", cnt, "training articles, dev=" + str(dev)) - print() return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 1e5280f89..b3b3479e2 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -29,7 +29,7 @@ NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 -MIN_ENTITY_FREQ = 200 +MIN_ENTITY_FREQ = 20 MIN_PAIR_OCC = 5 DOC_SENT_CUTOFF = 2 EPOCHS = 10 @@ -47,21 +47,21 @@ def run_pipeline(): # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False # create training dataset create_wp_training = False # train the EL pipe - train_pipe = False - measure_performance = False + train_pipe = True + measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again test_nlp_io = False @@ -135,46 +135,50 @@ def run_pipeline(): print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) train_limit = 10 dev_limit = 2 - print("Training on", train_limit, "articles") - print("Dev testing on", dev_limit, "articles") - print() train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, - limit=train_limit, - to_print=False) + limit=train_limit) + + print("Training on", len(train_data), "articles") + print() + + if not train_data: + print("Did not find any training data") + + else: + for itn in range(EPOCHS): + random.shuffle(train_data) + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + batchnr = 0 + + with nlp_2.disable_pipes(*other_pipes): + for batch in batches: + try: + docs, golds = zip(*batch) + nlp_2.update( + docs, + golds, + drop=DROPOUT, + losses=losses, + ) + batchnr += 1 + except Exception as e: + print("Error updating batch", e) + + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, - limit=dev_limit, - to_print=False) + limit=dev_limit) + print("Dev testing on", len(dev_data), "articles") 
+ print() - for itn in range(EPOCHS): - random.shuffle(train_data) - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) - batchnr = 0 - - with nlp_2.disable_pipes(*other_pipes): - for batch in batches: - try: - docs, golds = zip(*batch) - nlp_2.update( - docs, - golds, - drop=DROPOUT, - losses=losses, - ) - batchnr += 1 - except Exception as e: - print("Error updating batch", e) - - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - - if measure_performance: + if len(dev_data) and measure_performance: print() print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py index f6a6cbe23..967849abb 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -104,7 +104,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if lang_aliases: for item in lang_aliases: if to_print: - print("alias (" + lang + "):", item["value"]) + print("alias (" + lang + "):", item["value"]) if to_print: print() From 6332af40de10b221ec7ef4354b3d51bf6f80ca71 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 17 Jun 2019 14:39:40 +0200 Subject: [PATCH 080/148] baseline performances: oracle KB, random and prior prob --- .../training_set_creator.py | 116 ++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 143 ++++++++++++++---- 2 files changed, 161 insertions(+), 98 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 5d089c620..4ce69e75d 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -5,11 +5,8 @@ import os import re import bz2 import datetime -from os import listdir -from examples.pipeline.wiki_entity_linking import run_el from spacy.gold import GoldParse -from spacy.matcher import PhraseMatcher from . 
import wikipedia_processor as wp, kb_creator """ @@ -17,7 +14,7 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ # ENTITY_FILE = "gold_entities.csv" -ENTITY_FILE = "gold_entities_100000.csv" # use this file for faster processing +ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing def create_training(entity_def_input, training_output): @@ -58,7 +55,6 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): if cnt % 1000000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") - # print(clean_line) if clean_line == "": reading_revision = True @@ -121,7 +117,6 @@ text_regex = re.compile(r'(?<=).*(?= 0 and len(data) % 50 == 0: + print("Read", total_entities, "entities in", len(data), "articles") + fields = line.replace('\n', "").split(sep='|') + article_id = fields[0] + alias = fields[1] + wp_title = fields[2] + start = fields[3] + end = fields[4] - if gold_entities: - gold = GoldParse(doc=article_doc, links=gold_entities) - data.append((article_doc, gold)) + if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles: + if not current_doc or (current_article_id != article_id): + # store the data from the previous article + if gold_entities and current_doc: + gold = GoldParse(doc=current_doc, links=gold_entities) + data.append((current_doc, gold)) + total_entities += len(gold_entities) - except Exception as e: - print("Problem parsing article", article_id) - print(e) - raise e + # parse the new article text + file_name = article_id + ".txt" + try: + with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f: + text = f.read() + current_doc = nlp(text) + for ent in current_doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent.text + except Exception as e: + print("Problem parsing article", article_id, e) + current_article_id = article_id + gold_entities = list() + + # repeat checking this condition in case an exception was thrown + if current_doc and (current_article_id == article_id): + found_ent = ents_by_offset.get(start + "_" + end, None) + if found_ent: + if found_ent != alias: + skip_articles.add(current_article_id) + else: + gold_entities.append((int(start), int(end), wp_title)) + + print("Read", total_entities, "entities in", len(data), "articles") return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index b3b3479e2..7b54df527 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -64,7 +64,8 @@ def run_pipeline(): to_test_pipeline = True # write the NLP object, read back in and test again - test_nlp_io = False + to_write_nlp = True + to_read_nlp = True # STEP 1 : create prior probabilities from WP # run only once ! 
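Once the KB has been read back in, the to_test_kb path exercises candidate lookup directly; condensed from test_kb above, this is also the lookup the baselines further down are built on:

    for c in kb_2.get_candidates("Douglas Adams"):
        print(c.alias_, "-->", c.entity_, "prior:", c.prior_prob, "freq:", c.entity_freq)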
@@ -133,7 +134,7 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 10 + train_limit = 5 dev_limit = 2 train_data = training_set_creator.read_training(nlp=nlp_2, @@ -166,46 +167,42 @@ def run_pipeline(): ) batchnr += 1 except Exception as e: - print("Error updating batch", e) + print("Error updating batch:", e) + raise(e) - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) + if batchnr > 0: + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit) - print("Dev testing on", len(dev_data), "articles") + print() + print("Dev testing on", len(dev_data), "articles") if len(dev_data) and measure_performance: print() print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() + acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label = _measure_baselines(dev_data, kb_2) + print("dev acc oracle:", round(acc_oracle, 3), [(x, round(y, 3)) for x, y in acc_oracle_by_label.items()]) + print("dev acc random:", round(acc_random, 3), [(x, round(y, 3)) for x, y in acc_random_by_label.items()]) + print("dev acc prior:", round(acc_prior, 3), [(x, round(y, 3)) for x, y in acc_prior_by_label.items()]) + # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_1_1, dev_acc_1_1_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc combo:", round(dev_acc_1_1, 3), [(x, round(y, 3)) for x, y in dev_acc_1_1_dict.items()]) - train_acc_1_1, train_acc_1_1_dict = _measure_accuracy(train_data, el_pipe) - print("train acc combo:", round(train_acc_1_1, 3), [(x, round(y, 3)) for x, y in train_acc_1_1_dict.items()]) - - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1, dev_acc_0_1_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc prior:", round(dev_acc_0_1, 3), [(x, round(y, 3)) for x, y in dev_acc_0_1_dict.items()]) - train_acc_0_1, train_acc_0_1_dict = _measure_accuracy(train_data, el_pipe) - print("train acc prior:", round(train_acc_0_1, 3), [(x, round(y, 3)) for x, y in train_acc_0_1_dict.items()]) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc combo:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) # using only context el_pipe.context_weight = 1 el_pipe.prior_weight = 0 - dev_acc_1_0, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context:", round(dev_acc_1_0, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) - train_acc_1_0, train_acc_1_0_dict = _measure_accuracy(train_data, el_pipe) - print("train acc context:", round(train_acc_1_0, 3), [(x, round(y, 3)) for x, y in train_acc_1_0_dict.items()]) + dev_acc_context, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context:", round(dev_acc_context, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) print() # reset for follow-up tests @@ -219,7 +216,7 @@ def run_pipeline(): run_el_toy_example(nlp=nlp_2) print() - if test_nlp_io: + if to_write_nlp: print() print("STEP 9: testing NLP IO", datetime.datetime.now()) print() @@ -229,9 +226,10 @@ def run_pipeline(): print("reading from", NLP_2_DIR) nlp_3 = 
spacy.load(NLP_2_DIR) - print() - print("running toy example with NLP 2") - run_el_toy_example(nlp=nlp_3) + if to_read_nlp: + print() + print("running toy example with NLP 2") + run_el_toy_example(nlp=nlp_3) print() print("STOP", datetime.datetime.now()) @@ -270,6 +268,80 @@ def _measure_accuracy(data, el_pipe): except Exception as e: print("Error assessing accuracy", e) + acc, acc_by_label = calculate_acc(correct_by_label, incorrect_by_label) + return acc, acc_by_label + + +def _measure_baselines(data, kb): + random_correct_by_label = dict() + random_incorrect_by_label = dict() + + oracle_correct_by_label = dict() + oracle_incorrect_by_label = dict() + + prior_correct_by_label = dict() + prior_incorrect_by_label = dict() + + docs = [d for d, g in data if len(d) > 0] + golds = [g for d, g in data if len(d) > 0] + + for doc, gold in zip(docs, golds): + try: + correct_entries_per_article = dict() + for entity in gold.links: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + + for ent in doc.ents: + ent_label = ent.label_ + start = ent.start_char + end = ent.end_char + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' + if gold_entity is not None: + candidates = kb.get_candidates(ent.text) + oracle_candidate = "" + best_candidate = "" + random_candidate = "" + if candidates: + scores = list() + + for c in candidates: + scores.append(c.prior_prob) + if c.entity_ == gold_entity: + oracle_candidate = c.entity_ + + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index].entity_ + random_candidate = random.choice(candidates).entity_ + + if gold_entity == best_candidate: + prior_correct_by_label[ent_label] = prior_correct_by_label.get(ent_label, 0) + 1 + else: + prior_incorrect_by_label[ent_label] = prior_incorrect_by_label.get(ent_label, 0) + 1 + + if gold_entity == random_candidate: + random_correct_by_label[ent_label] = random_correct_by_label.get(ent_label, 0) + 1 + else: + random_incorrect_by_label[ent_label] = random_incorrect_by_label.get(ent_label, 0) + 1 + + if gold_entity == oracle_candidate: + oracle_correct_by_label[ent_label] = oracle_correct_by_label.get(ent_label, 0) + 1 + else: + oracle_incorrect_by_label[ent_label] = oracle_incorrect_by_label.get(ent_label, 0) + 1 + + except Exception as e: + print("Error assessing accuracy", e) + + acc_prior, acc_prior_by_label = calculate_acc(prior_correct_by_label, prior_incorrect_by_label) + acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) + acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) + + return acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + + +def calculate_acc(correct_by_label, incorrect_by_label): acc_by_label = dict() total_correct = 0 total_incorrect = 0 @@ -303,18 +375,25 @@ def run_el_toy_example(nlp): "The main character in Doug's novel is the man Arthur Dent, " \ "but Douglas doesn't write about George Washington or Homer Simpson." doc = nlp(text) - + print(text) for ent in doc.ents: print("ent", ent.text, ent.label_, ent.kb_id_) - print() - # Q4426480 is her husband, Q3568763 her tutor - text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine."\ - "Ada Lovelace loved her husband William King dearly. 
" \ - "Ada Lovelace was tutored by her favorite physics tutor William King." + # Q4426480 is her husband + text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + "She loved her husband William King dearly. " doc = nlp(text) + print(text) + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + print() + # Q3568763 is her tutor + text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + "She was tutored by her favorite physics tutor William King." + doc = nlp(text) + print(text) for ent in doc.ents: print("ent", ent.text, ent.label_, ent.kb_id_) From ffae7d35552476adc14e2be6d66f64edd6ae06ed Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 00:05:47 +0200 Subject: [PATCH 081/148] sentence encoder only (removing article/mention encoder) --- .../training_set_creator.py | 41 +++--- .../wiki_entity_linking/wiki_nel_pipeline.py | 15 +- spacy/pipeline/pipes.pyx | 129 +++++++++--------- 3 files changed, 95 insertions(+), 90 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 4ce69e75d..cc985202c 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -294,7 +294,6 @@ def read_training(nlp, training_dir, dev, limit): # we assume the data is written sequentially current_article_id = None current_doc = None - gold_entities = list() ents_by_offset = dict() skip_articles = set() total_entities = 0 @@ -302,8 +301,6 @@ def read_training(nlp, training_dir, dev, limit): with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: if not limit or len(data) < limit: - if len(data) > 0 and len(data) % 50 == 0: - print("Read", total_entities, "entities in", len(data), "articles") fields = line.replace('\n', "").split(sep='|') article_id = fields[0] alias = fields[1] @@ -313,34 +310,42 @@ def read_training(nlp, training_dir, dev, limit): if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles: if not current_doc or (current_article_id != article_id): - # store the data from the previous article - if gold_entities and current_doc: - gold = GoldParse(doc=current_doc, links=gold_entities) - data.append((current_doc, gold)) - total_entities += len(gold_entities) - # parse the new article text file_name = article_id + ".txt" try: with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f: text = f.read() - current_doc = nlp(text) - for ent in current_doc.ents: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent.text + if len(text) < 30000: # threshold for convenience / speed of processing + current_doc = nlp(text) + current_article_id = article_id + ents_by_offset = dict() + for ent in current_doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + else: + skip_articles.add(current_article_id) + current_doc = None except Exception as e: print("Problem parsing article", article_id, e) - current_article_id = article_id - gold_entities = list() - # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: - if found_ent != alias: + if found_ent.text != alias: skip_articles.add(current_article_id) + current_doc = None else: 
- gold_entities.append((int(start), int(end), wp_title)) + sent = found_ent.sent.as_doc() + # currently feeding the gold data one entity per sentence at a time + gold_start = int(start) - found_ent.sent.start_char + gold_end = int(end) - found_ent.sent.start_char + gold_entities = list() + gold_entities.append((gold_start, gold_end, wp_title)) + gold = GoldParse(doc=current_doc, links=gold_entities) + data.append((sent, gold)) + total_entities += 1 + if len(data) % 500 == 0: + print(" -read", total_entities, "entities") - print("Read", total_entities, "entities in", len(data), "articles") + print(" -read", total_entities, "entities") return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 7b54df527..bdae023b9 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -9,7 +9,6 @@ from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_ from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy -from spacy.vocab import Vocab from spacy.kb import KnowledgeBase import datetime @@ -64,8 +63,8 @@ def run_pipeline(): to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = True - to_read_nlp = True + to_write_nlp = False + to_read_nlp = False # STEP 1 : create prior probabilities from WP # run only once ! @@ -134,8 +133,8 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 5 - dev_limit = 2 + train_limit = 25000 + dev_limit = 1000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -345,7 +344,11 @@ def calculate_acc(correct_by_label, incorrect_by_label): acc_by_label = dict() total_correct = 0 total_incorrect = 0 - for label, correct in correct_by_label.items(): + all_keys = set() + all_keys.update(correct_by_label.keys()) + all_keys.update(incorrect_by_label.keys()) + for label in sorted(all_keys): + correct = correct_by_label.get(label, 0) incorrect = incorrect_by_label.get(label, 0) total_correct += correct total_incorrect += incorrect diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 5d82da7ee..fbdca8280 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1079,36 +1079,39 @@ class EntityLinker(Pipe): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 32) - article_width = cfg.get("article_width", 128) - sent_width = cfg.get("sent_width", 64) entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB + sent_width = entity_width - article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) - sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) + model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) # dimension of the mention encoder needs to match the dimension of the entity encoder - mention_width = article_width + sent_width - mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) + # article_width = cfg.get("article_width", 128) + # sent_width = cfg.get("sent_width", 64) + # article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) + # mention_width = article_width + sent_width + # 
mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) + # return article_encoder, sent_encoder, mention_encoder - return article_encoder, sent_encoder, mention_encoder + return model def __init__(self, **cfg): - self.article_encoder = True - self.sent_encoder = True - self.mention_encoder = True + # self.article_encoder = True + # self.sent_encoder = True + # self.mention_encoder = True + self.model = True self.kb = None self.cfg = dict(cfg) self.doc_cutoff = self.cfg.get("doc_cutoff", 5) - self.sgd_article = None - self.sgd_sent = None - self.sgd_mention = None + # self.sgd_article = None + # self.sgd_sent = None + # self.sgd_mention = None def set_kb(self, kb): self.kb = kb def require_model(self): # Raise an error if the component's model is not initialized. - if getattr(self, "mention_encoder", None) in (None, True, False): + if getattr(self, "model", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) def require_kb(self): @@ -1121,12 +1124,19 @@ class EntityLinker(Pipe): self.require_kb() self.cfg["entity_width"] = self.kb.entity_vector_length - if self.mention_encoder is True: - self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) - return self.sgd_article + if self.model is True: + self.model = self.Model(**self.cfg) + + if sgd is None: + sgd = self.create_optimizer() + return sgd + + # if self.mention_encoder is True: + # self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) + # self.sgd_article = create_default_optimizer(self.article_encoder.ops) + # self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + # self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + # return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() @@ -1146,7 +1156,7 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - article_docs = list() + # article_docs = list() sentence_docs = list() entity_encodings = list() @@ -1173,34 +1183,32 @@ class EntityLinker(Pipe): if kb_id == gold_kb: prior_prob = c.prior_prob entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - article_docs.append(first_par) + # article_docs.append(first_par) sentence_docs.append(sentence) if len(entity_encodings) > 0: - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(article_docs))] - mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) + # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in range(len(article_docs))] + # mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) + sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) - loss, d_scores = self.get_loss(scores=mention_encodings, 
golds=entity_encodings, docs=None) - mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) + loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) + bp_sent(d_scores, sgd=sgd) # gradient : concat (doc+sent) vs. desc - sent_start = self.article_encoder.nO - sent_gradients = list() - doc_gradients = list() - for x in mention_gradient: - doc_gradients.append(list(x[0:sent_start])) - sent_gradients.append(list(x[sent_start:])) - - bp_doc(doc_gradients, sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) + # sent_start = self.article_encoder.nO + # sent_gradients = list() + # doc_gradients = list() + # for x in mention_gradient: + # doc_gradients.append(list(x[0:sent_start])) + # sent_gradients.append(list(x[sent_start:])) + # bp_doc(doc_gradients, sgd=self.sgd_article) + # bp_sent(sent_gradients, sgd=self.sgd_sent) if losses is not None: losses[self.name] += loss @@ -1262,14 +1270,17 @@ class EntityLinker(Pipe): first_par_end = sent.end first_par = doc[0:first_par_end].as_doc() - doc_encoding = self.article_encoder([first_par]) + # doc_encoding = self.article_encoder([first_par]) for ent in doc.ents: sent_doc = ent.sent.as_doc() if len(sent_doc) > 0: - sent_encoding = self.sent_encoder([sent_doc]) - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - mention_enc_t = np.transpose(mention_encoding) + # sent_encoding = self.sent_encoder([sent_doc]) + # concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + # mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + # mention_enc_t = np.transpose(mention_encoding) + + sent_encoding = self.model([sent_doc]) + sent_enc_t = np.transpose(sent_encoding) candidates = self.kb.get_candidates(ent.text) if candidates: @@ -1278,7 +1289,7 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob * self.prior_weight kb_id = c.entity_ entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), mention_enc_t)) * self.context_weight + sim = float(cosine(np.asarray([entity_encoding]), sent_enc_t)) * self.context_weight score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
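# The line above combines the two signals like a probabilistic OR when both weights
# are 1 (as the pipeline script sets them): e.g. prior_prob = 0.8 and sim = 0.5 give
# score = 0.8 + 0.5 - 0.4 = 0.9, so either a strong prior or a close context match
# alone is enough to rank a candidate highly.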
scores.append(score) @@ -1299,34 +1310,20 @@ class EntityLinker(Pipe): serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.dump(p) - if self.mention_encoder not in (None, True, False): - serialize["article_encoder"] = lambda p: p.open("wb").write(self.article_encoder.to_bytes()) - serialize["sent_encoder"] = lambda p: p.open("wb").write(self.sent_encoder.to_bytes()) - serialize["mention_encoder"] = lambda p: p.open("wb").write(self.mention_encoder.to_bytes()) + if self.model not in (None, True, False): + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): - def load_article_encoder(p): - if self.article_encoder is True: - self.article_encoder, _, _ = self.Model(**self.cfg) - self.article_encoder.from_bytes(p.open("rb").read()) - - def load_sent_encoder(p): - if self.sent_encoder is True: - _, self.sent_encoder, _ = self.Model(**self.cfg) - self.sent_encoder.from_bytes(p.open("rb").read()) - - def load_mention_encoder(p): - if self.mention_encoder is True: - _, _, self.mention_encoder = self.Model(**self.cfg) - self.mention_encoder.from_bytes(p.open("rb").read()) + def load_model(p): + if self.model is True: + self.model = self.Model(**self.cfg) + self.model.from_bytes(p.open("rb").read()) deserialize = OrderedDict() deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) - deserialize["article_encoder"] = load_article_encoder - deserialize["sent_encoder"] = load_sent_encoder - deserialize["mention_encoder"] = load_mention_encoder + deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self From 0d177c1146d6384737a20400a2218a411fd8ab81 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 13:20:40 +0200 Subject: [PATCH 082/148] clean up code, remove old code, move to bin --- .../wiki_entity_linking => bin}/__init__.py | 0 bin/wiki_entity_linking/__init__.py | 0 .../wiki_entity_linking/kb_creator.py | 13 +- .../wiki_entity_linking/train_descriptions.py | 39 -- .../training_set_creator.py | 7 +- .../wiki_entity_linking/wikidata_processor.py | 31 +- .../wikipedia_processor.py | 4 +- .../pipeline/wiki_entity_linking/run_el.py | 136 ----- .../pipeline/wiki_entity_linking/train_el.py | 490 ------------------ ...pipeline.py => wikidata_entity_linking.py} | 92 ++-- spacy/_ml.py | 5 +- spacy/pipeline/pipes.pyx | 62 +-- 12 files changed, 92 insertions(+), 787 deletions(-) rename {examples/pipeline/wiki_entity_linking => bin}/__init__.py (100%) create mode 100644 bin/wiki_entity_linking/__init__.py rename {examples/pipeline => bin}/wiki_entity_linking/kb_creator.py (94%) rename {examples/pipeline => bin}/wiki_entity_linking/train_descriptions.py (69%) rename {examples/pipeline => bin}/wiki_entity_linking/training_set_creator.py (98%) rename {examples/pipeline => bin}/wiki_entity_linking/wikidata_processor.py (80%) rename {examples/pipeline => bin}/wiki_entity_linking/wikipedia_processor.py (98%) delete mode 100644 examples/pipeline/wiki_entity_linking/run_el.py delete mode 100644 examples/pipeline/wiki_entity_linking/train_el.py rename examples/pipeline/{wiki_entity_linking/wiki_nel_pipeline.py => wikidata_entity_linking.py} (82%) diff --git a/examples/pipeline/wiki_entity_linking/__init__.py b/bin/__init__.py similarity index 100% rename 
from examples/pipeline/wiki_entity_linking/__init__.py rename to bin/__init__.py diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py similarity index 94% rename from examples/pipeline/wiki_entity_linking/kb_creator.py rename to bin/wiki_entity_linking/kb_creator.py index 80d0e21e9..8d293a0a1 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -1,15 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import spacy -from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder +from bin.wiki_entity_linking.train_descriptions import EntityEncoder from spacy.kb import KnowledgeBase import csv import datetime -from . import wikipedia_processor as wp -from . import wikidata_processor as wd +from bin.wiki_entity_linking import wikidata_processor as wd, wikipedia_processor as wp INPUT_DIM = 300 # dimension of pre-trained vectors DESC_WIDTH = 64 @@ -34,7 +32,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, else: # read the mappings from file - title_to_id = _get_entity_to_id(entity_def_output) + title_to_id = get_entity_to_id(entity_def_output) id_to_descr = _get_id_to_description(entity_descr_output) print() @@ -56,7 +54,8 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, frequency_list.append(freq) filtered_title_to_id[title] = entity - print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles with filter frequency", min_entity_freq) + print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), + "titles with filter frequency", min_entity_freq) print() print(" * train entity encoder", datetime.datetime.now()) @@ -101,7 +100,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ descr_file.write(str(qid) + "|" + descr + "\n") -def _get_entity_to_id(entity_def_output): +def get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: csvreader = csv.reader(csvfile, delimiter='|') diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py similarity index 69% rename from examples/pipeline/wiki_entity_linking/train_descriptions.py rename to bin/wiki_entity_linking/train_descriptions.py index bf4bcbc3d..cc5016237 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -55,8 +55,6 @@ class EntityEncoder: print("Trained on", processed, "entities across", self.EPOCHS, "epochs") print("Final loss:", loss) - # self._test_encoder() - def _train_model(self, description_list): # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy @@ -123,40 +121,3 @@ class EntityEncoder: def _get_loss(golds, scores): loss, gradients = get_cossim_loss(scores, golds) return loss, gradients - - def _test_encoder(self): - # Test encoder on some dummy examples - desc_A1 = "Fictional character in The Simpsons" - desc_A2 = "Simpsons - fictional human" - desc_A3 = "Fictional character in The Flintstones" - desc_A4 = "Politician from the US" - - A1_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A1))]) - A2_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A2))]) - A3_doc_vector 
= np.asarray([self._get_doc_embedding(self.nlp(desc_A3))]) - A4_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A4))]) - - loss_a1_a1, _ = get_cossim_loss(A1_doc_vector, A1_doc_vector) - loss_a1_a2, _ = get_cossim_loss(A1_doc_vector, A2_doc_vector) - loss_a1_a3, _ = get_cossim_loss(A1_doc_vector, A3_doc_vector) - loss_a1_a4, _ = get_cossim_loss(A1_doc_vector, A4_doc_vector) - - print("sim doc A1 A1", loss_a1_a1) - print("sim doc A1 A2", loss_a1_a2) - print("sim doc A1 A3", loss_a1_a3) - print("sim doc A1 A4", loss_a1_a4) - - A1_encoded = self.encoder(A1_doc_vector) - A2_encoded = self.encoder(A2_doc_vector) - A3_encoded = self.encoder(A3_doc_vector) - A4_encoded = self.encoder(A4_doc_vector) - - loss_a1_a1, _ = get_cossim_loss(A1_encoded, A1_encoded) - loss_a1_a2, _ = get_cossim_loss(A1_encoded, A2_encoded) - loss_a1_a3, _ = get_cossim_loss(A1_encoded, A3_encoded) - loss_a1_a4, _ = get_cossim_loss(A1_encoded, A4_encoded) - - print("sim encoded A1 A1", loss_a1_a1) - print("sim encoded A1 A2", loss_a1_a2) - print("sim encoded A1 A3", loss_a1_a3) - print("sim encoded A1 A4", loss_a1_a4) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py similarity index 98% rename from examples/pipeline/wiki_entity_linking/training_set_creator.py rename to bin/wiki_entity_linking/training_set_creator.py index cc985202c..a0d130824 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -7,7 +7,7 @@ import bz2 import datetime from spacy.gold import GoldParse -from . import wikipedia_processor as wp, kb_creator +from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm @@ -18,7 +18,7 @@ ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processin def create_training(entity_def_input, training_output): - wp_to_id = kb_creator._get_entity_to_id(entity_def_input) + wp_to_id = kb_creator.get_entity_to_id(entity_def_input) _process_wikipedia_texts(wp_to_id, training_output, limit=None) @@ -71,7 +71,8 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): elif clean_line == "": if article_id: try: - _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), training_output) + _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), + training_output) except Exception as e: print("Error processing article", article_id, article_title, e) else: diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py similarity index 80% rename from examples/pipeline/wiki_entity_linking/wikidata_processor.py rename to bin/wiki_entity_linking/wikidata_processor.py index 967849abb..899c607cc 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -13,9 +13,12 @@ def read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" lang = 'en' - # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' + # filter currently disabled to get ALL data + prop_filter = dict() + # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected + title_to_id = dict() id_to_descr = dict() @@ -25,6 +28,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): parse_labels = False parse_descriptions = True parse_aliases = False + parse_claims = False with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() @@ -45,14 +49,15 @@ def read_wikidata_entities_json(limit=None, to_print=False): keep = True claims = obj["claims"] - # for prop, value_set in prop_filter.items(): - # claim_property = claims.get(prop, None) - # if claim_property: - # for cp in claim_property: - # cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - # cp_rank = cp['rank'] - # if cp_rank != "deprecated" and cp_id in value_set: - # keep = True + if parse_claims: + for prop, value_set in prop_filter.items(): + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + cp_rank = cp['rank'] + if cp_rank != "deprecated" and cp_id in value_set: + keep = True if keep: unique_id = obj["id"] @@ -64,8 +69,10 @@ def read_wikidata_entities_json(limit=None, to_print=False): # parsing all properties that refer to other entities if parse_properties: for prop, claim_property in claims.items(): - cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] - cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property + if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) + if cp_dict.get('id') is not None] if cp_values: if to_print: print("prop:", prop, cp_values) @@ -104,7 +111,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if lang_aliases: for item in lang_aliases: if to_print: - print("alias (" + lang + "):", item["value"]) + print("alias (" + lang + "):", item["value"]) if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py similarity index 98% rename from examples/pipeline/wiki_entity_linking/wikipedia_processor.py rename to bin/wiki_entity_linking/wikipedia_processor.py index e53423487..0747c9db7 100644 --- a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -26,8 +26,8 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", - "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", - "User", "User talk", "v", "voy", + "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", + "tswiki", "User", "User talk", "v", "voy", "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", "Wikipedia", "Wikipedia talk", "Wikiquote", 
"Wikisource", "Wikispecies", "Wikitech", "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py deleted file mode 100644 index c26e8d65a..000000000 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os -import spacy -import datetime -from os import listdir - -from examples.pipeline.wiki_entity_linking import training_set_creator - -# requires: pip install neuralcoref --no-binary neuralcoref -# import neuralcoref - - -def run_kb_toy_example(kb): - for mention in ("Bush", "Douglas Adams", "Homer"): - candidates = kb.get_candidates(mention) - - print("generating candidates for " + mention + " :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - - - -def run_el_dev(nlp, kb, training_dir, limit=None): - correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir) - - predictions = list() - golds = list() - - cnt = 0 - for f in listdir(training_dir): - if not limit or cnt < limit: - if is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0: - print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") - cnt += 1 - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - doc = nlp(text) - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to other types - gold_entity = correct_entries_per_article[article_id].get(ent.text, None) - # only evaluating gold entities we know, because the training data is not complete - if gold_entity: - predictions.append(ent.kb_id_) - golds.append(gold_entity) - - print("Processed", cnt, "dev articles") - print() - evaluate(predictions, golds) - - -def is_dev(file_name): - return file_name.endswith("3.txt") - - -def evaluate(predictions, golds, to_print=True, times_hundred=True): - if len(predictions) != len(golds): - raise ValueError("predictions and gold entities should have the same length") - - tp = 0 - fp = 0 - fn = 0 - - corrects = 0 - incorrects = 0 - - for pred, gold in zip(predictions, golds): - is_correct = pred == gold - if is_correct: - corrects += 1 - else: - incorrects += 1 - if not pred: - if not is_correct: # we don't care about tn - fn += 1 - elif is_correct: - tp += 1 - else: - fp += 1 - - if to_print: - print("Evaluating", len(golds), "entities") - print("tp", tp) - print("fp", fp) - print("fn", fn) - - precision = tp / (tp + fp + 0.0000001) - recall = tp / (tp + fn + 0.0000001) - if times_hundred: - precision = precision*100 - recall = recall*100 - fscore = 2 * recall * precision / (recall + precision + 0.0000001) - - accuracy = corrects / (corrects + incorrects) - - if to_print: - print("precision", round(precision, 1), "%") - print("recall", round(recall, 1), "%") - print("Fscore", round(fscore, 1), "%") - print("Accuracy", round(accuracy, 1), "%") - - return precision, recall, fscore, accuracy - - - - - -# TODO -def add_coref(nlp): - """ Add coreference resolution to our model """ - # TODO: this doesn't work yet - # neuralcoref.add_to_pipe(nlp) - print("done adding to pipe") - - doc = nlp(u'My sister has a dog. 
She loves him.') - print("done doc") - - print(doc._.has_coref) - print(doc._.coref_clusters) - - -# TODO -def _run_ner_depr(nlp, clean_text, article_dict): - doc = nlp(clean_text) - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to non-persons - ent_id = article_dict.get(ent.text) - if ent_id: - print(" -", ent.text, ent.label_, ent_id) - else: - print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py deleted file mode 100644 index a4026d935..000000000 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ /dev/null @@ -1,490 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os -import datetime -from os import listdir -import numpy as np -import random -from random import shuffle -from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural.util import get_array_module - -from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator - -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine - -from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten -from thinc.v2v import Model, Maxout, Affine -from thinc.t2v import Pooling, mean_pool -from thinc.t2t import ParametricAttention -from thinc.misc import Residual -from thinc.misc import LayerNorm as LN - -# from spacy.cli.pretrain import get_cossim_loss -from spacy.matcher import PhraseMatcher - - -class EL_Model: - - PRINT_INSPECT = False - PRINT_BATCH_LOSS = False - EPS = 0.0000000005 - - BATCH_SIZE = 100 - - DOC_CUTOFF = 300 # number of characters from the doc context - INPUT_DIM = 300 # dimension of pre-trained vectors - - HIDDEN_1_WIDTH = 32 - DESC_WIDTH = 64 - ARTICLE_WIDTH = 128 - SENT_WIDTH = 64 - - DROP = 0.4 - LEARN_RATE = 0.005 - EPOCHS = 10 - L2 = 1e-6 - - name = "entity_linker" - - def __init__(self, kb, nlp): - run_el._prepare_pipeline(nlp, kb) - self.nlp = nlp - self.kb = kb - - self._build_cnn(embed_width=self.INPUT_DIM, - desc_width=self.DESC_WIDTH, - article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH, - hidden_1_width=self.HIDDEN_1_WIDTH) - - def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): - np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") - - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) - - train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ - self._get_training_data(training_dir, id_to_descr, False, trainlimit, to_print=False) - train_clusters = list(train_ent.keys()) - - dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ - self._get_training_data(training_dir, id_to_descr, True, devlimit, to_print=False) - dev_clusters = list(dev_ent.keys()) - - dev_pos_count = len([g for g in dev_gold.values() if g]) - dev_neg_count = len([g for g in dev_gold.values() if not g]) - - # inspect data - if self.PRINT_INSPECT: - for cluster, entities in train_ent.items(): - print() - for entity in entities: - print("entity", entity) - print("gold", train_gold[entity]) - print("desc", train_desc[entity]) - print("sentence ID", train_sent[entity]) - print("sentence text", train_sent_texts[train_sent[entity]]) - print("article ID", train_art[entity]) - print("article text", train_art_texts[train_art[entity]]) - print() - - train_pos_entities = [k for k, v in train_gold.items() if v] - 
train_neg_entities = [k for k, v in train_gold.items() if not v] - - train_pos_count = len(train_pos_entities) - train_neg_count = len(train_neg_entities) - - self._begin_training() - - if to_print: - print() - print("Training on", len(train_clusters), "entity clusters in", len(train_art_texts), "articles") - print("Training instances pos/neg:", train_pos_count, train_neg_count) - print() - print("Dev test on", len(dev_clusters), "entity clusters in", len(dev_art_texts), "articles") - print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) - print() - print(" DOC_CUTOFF", self.DOC_CUTOFF) - print(" INPUT_DIM", self.INPUT_DIM) - print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) - print(" DESC_WIDTH", self.DESC_WIDTH) - print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) - print(" SENT_WIDTH", self.SENT_WIDTH) - print(" DROP", self.DROP) - print(" LEARNING RATE", self.LEARN_RATE) - print(" BATCH SIZE", self.BATCH_SIZE) - print() - - dev_random = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - calc_random=True) - print("acc", "dev_random", round(dev_random, 2)) - - dev_pre = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - avg=True) - print("acc", "dev_pre", round(dev_pre, 2)) - print() - - processed = 0 - for i in range(self.EPOCHS): - shuffle(train_clusters) - - start = 0 - stop = min(self.BATCH_SIZE, len(train_clusters)) - - while start < len(train_clusters): - next_batch = {c: train_ent[c] for c in train_clusters[start:stop]} - processed += len(next_batch.keys()) - - self.update(entity_clusters=next_batch, golds=train_gold, descs=train_desc, - art_texts=train_art_texts, arts=train_art, - sent_texts=train_sent_texts, sents=train_sent) - - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(train_clusters)) - - train_acc = self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, avg=True) - dev_acc = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, avg=True) - - print(i, "acc train/dev", round(train_acc, 2), round(dev_acc, 2)) - - if to_print: - print() - print("Trained on", processed, "entity clusters across", self.EPOCHS, "epochs") - - def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, avg=True, calc_random=False): - correct = 0 - incorrect = 0 - - if calc_random: - for cluster, entities in entity_clusters.items(): - correct_entities = [e for e in entities if golds[e]] - assert len(correct_entities) == 1 - - entities = list(entities) - shuffle(entities) - - if calc_random: - predicted_entity = random.choice(entities) - if predicted_entity in correct_entities: - correct += 1 - else: - incorrect += 1 - - else: - all_clusters = list() - arts_list = list() - sents_list = list() - - for cluster in entity_clusters.keys(): - all_clusters.append(cluster) - arts_list.append(art_texts[arts[cluster]]) - sents_list.append(sent_texts[sents[cluster]]) - - art_docs = list(self.nlp.pipe(arts_list)) - sent_docs = list(self.nlp.pipe(sents_list)) - - for i, cluster in enumerate(all_clusters): - entities = entity_clusters[cluster] - correct_entities = [e for e in entities if golds[e]] - assert len(correct_entities) == 1 - - entities = list(entities) - shuffle(entities) - - desc_docs = self.nlp.pipe([descs[e] for e in entities]) - sent_doc = sent_docs[i] - article_doc = art_docs[i] - - predicted_index = self._predict(article_doc=article_doc, sent_doc=sent_doc, - 
desc_docs=desc_docs, avg=avg) - if entities[predicted_index] in correct_entities: - correct += 1 - else: - incorrect += 1 - - if correct == incorrect == 0: - return 0 - - acc = correct / (correct + incorrect) - return acc - - def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): - if avg: - with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.desc_encoder.use_params(self.sgd_desc.averages)\ - and self.sent_encoder.use_params(self.sgd_sent.averages): - desc_encodings = self.desc_encoder(desc_docs) - doc_encoding = self.article_encoder([article_doc]) - sent_encoding = self.sent_encoder([sent_doc]) - - else: - desc_encodings = self.desc_encoder(desc_docs) - doc_encoding = self.article_encoder([article_doc]) - sent_encoding = self.sent_encoder([sent_doc]) - - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - - if avg: - with self.cont_encoder.use_params(self.sgd_cont.averages): - cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) - - else: - cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) - - context_enc = np.transpose(cont_encodings) - - highest_sim = -5 - best_i = -1 - for i, desc_enc in enumerate(desc_encodings): - sim = cosine(desc_enc, context_enc) - if sim >= highest_sim: - best_i = i - highest_sim = sim - - return best_i - - def _build_cnn(self, embed_width, desc_width, article_width, sent_width, hidden_1_width): - self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) - self.cont_encoder = self._context_encoder(embed_width=embed_width, article_width=article_width, - sent_width=sent_width, hidden_width=hidden_1_width, - end_width=desc_width) - - - # def _encoder(self, width): - # tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, - # subword_features=False, conv_depth=4, bilstm_depth=0) - # - # return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) - - def _context_encoder(self, embed_width, article_width, sent_width, hidden_width, end_width): - self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) - self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) - - model = Affine(end_width, article_width+sent_width, drop_factor=0.0) - return model - - @staticmethod - def _encoder(in_width, hidden_with, end_width): - conv_depth = 2 - cnn_maxout_pieces = 3 - - with Model.define_operators({">>": chain, "**": clone}): - convolution = Residual((ExtractWindow(nW=1) >> - LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) - - encoder = SpacyVectors \ - >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ - >> flatten_add_lengths \ - >> ParametricAttention(hidden_with)\ - >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ - >> zero_init(Affine(end_width, hidden_with, drop_factor=0.0)) - - # TODO: ReLu or LN(Maxout) ? - # sum_pool or mean_pool ? 
- - return encoder - - def _begin_training(self): - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_article.learn_rate = self.LEARN_RATE - self.sgd_article.L2 = self.L2 - - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_sent.learn_rate = self.LEARN_RATE - self.sgd_sent.L2 = self.L2 - - self.sgd_cont = create_default_optimizer(self.cont_encoder.ops) - self.sgd_cont.learn_rate = self.LEARN_RATE - self.sgd_cont.L2 = self.L2 - - self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) - self.sgd_desc.learn_rate = self.LEARN_RATE - self.sgd_desc.L2 = self.L2 - - def get_loss(self, pred, gold, targets): - loss, gradients = self.get_cossim_loss(pred, gold, targets) - return loss, gradients - - def get_cossim_loss(self, yh, y, t): - # Add a small constant to avoid 0 vectors - # print() - # print("yh", yh) - # print("y", y) - # print("t", t) - yh = yh + 1e-8 - y = y + 1e-8 - # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - xp = get_array_module(yh) - norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) - norm_y = xp.linalg.norm(y, axis=1, keepdims=True) - mul_norms = norm_yh * norm_y - cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms - # print("cos", cos) - d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) - # print("abs", xp.abs(cos - t)) - loss = xp.abs(cos - t).sum() - # print("loss", loss) - # print("d_yh", d_yh) - inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) - # print("inverse", inverse) - return loss, -inverse - - def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): - arts_list = list() - sents_list = list() - descs_list = list() - targets = list() - - for cluster, entities in entity_clusters.items(): - art = art_texts[arts[cluster]] - sent = sent_texts[sents[cluster]] - for e in entities: - if golds[e]: - arts_list.append(art) - sents_list.append(sent) - descs_list.append(descs[e]) - targets.append([1]) - # else: - # arts_list.append(art) - # sents_list.append(sent) - # descs_list.append(descs[e]) - # targets.append([-1]) - - desc_docs = self.nlp.pipe(descs_list) - desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - - art_docs = self.nlp.pipe(arts_list) - sent_docs = self.nlp.pipe(sents_list) - - doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) - - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(targets))] - cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - - loss, cont_gradient = self.get_loss(cont_encodings, desc_encodings, targets) - - # loss, desc_gradient = self.get_loss(desc_encodings, cont_encodings, targets) - # cont_gradient = cont_gradient / 2 - # desc_gradient = desc_gradient / 2 - # bp_desc(desc_gradient, sgd=self.sgd_desc) - - if self.PRINT_BATCH_LOSS: - print("batch loss", loss) - - context_gradient = bp_cont(cont_gradient, sgd=self.sgd_cont) - - # gradient : concat (doc+sent) vs. 
desc - sent_start = self.ARTICLE_WIDTH - sent_gradients = list() - doc_gradients = list() - for x in context_gradient: - doc_gradients.append(list(x[0:sent_start])) - sent_gradients.append(list(x[sent_start:])) - - bp_doc(doc_gradients, sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) - - def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print): - correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir) - - entities_by_cluster = dict() - gold_by_entity = dict() - desc_by_entity = dict() - article_by_cluster = dict() - text_by_article = dict() - sentence_by_cluster = dict() - text_by_sentence = dict() - sentence_by_text = dict() - - cnt = 0 - next_entity_nr = 1 - next_sent_nr = 1 - files = listdir(training_dir) - shuffle(files) - for f in files: - if not limit or cnt < limit: - if dev == run_el.is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0 and to_print: - print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") - - try: - # parse the article text - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - article_doc = self.nlp(text) - truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] - text_by_article[article_id] = truncated_text - - # process all positive and negative entities, collect all relevant mentions in this article - for mention, entity_pos in correct_entries[article_id].items(): - cluster = article_id + "_" + mention - descr = id_to_descr.get(entity_pos) - entities = set() - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 1 - desc_by_entity[entity] = descr - entities.add(entity) - - entity_negs = incorrect_entries[article_id][mention] - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 0 - desc_by_entity[entity] = descr - entities.add(entity) - - found_matches = 0 - if len(entities) > 1: - entities_by_cluster[cluster] = entities - - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(self.nlp.vocab) - patterns = list(self.nlp.tokenizer.pipe([mention])) - - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) - - # store sentences - for match_id, start, end in matches: - span = article_doc[start:end] - if mention == span.text: - found_matches += 1 - sent_text = span.sent.text - sent_nr = sentence_by_text.get(sent_text, None) - if sent_nr is None: - sent_nr = "S_" + str(next_sent_nr) + article_id - next_sent_nr += 1 - text_by_sentence[sent_nr] = sent_text - sentence_by_text[sent_text] = sent_nr - article_by_cluster[cluster] = article_id - sentence_by_cluster[cluster] = sent_nr - - if found_matches == 0: - # print("Could not find neg instances or sentence matches for", mention, "in", article_id) - entities_by_cluster.pop(cluster, None) - article_by_cluster.pop(cluster, None) - sentence_by_cluster.pop(cluster, None) - for entity in entities: - gold_by_entity.pop(entity, None) - desc_by_entity.pop(entity, None) - cnt += 1 - except: - print("Problem parsing article", article_id) - - if to_print: - print() - print("Processed", cnt, "training articles, dev=" + str(dev)) - print() - return entities_by_cluster, gold_by_entity, desc_by_entity, article_by_cluster, text_by_article, \ - 
sentence_by_cluster, text_by_sentence - diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wikidata_entity_linking.py similarity index 82% rename from examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py rename to examples/pipeline/wikidata_entity_linking.py index bdae023b9..d537cce7e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -5,8 +5,8 @@ import random from spacy.util import minibatch, compounding -from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el -from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH +from bin.wiki_entity_linking import training_set_creator, kb_creator, wikipedia_processor as wp +from bin.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.kb import KnowledgeBase @@ -30,9 +30,11 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_ENTITY_FREQ = 20 MIN_PAIR_OCC = 5 -DOC_SENT_CUTOFF = 2 + EPOCHS = 10 DROPOUT = 0.1 +LEARN_RATE = 0.005 +L2 = 1e-6 def run_pipeline(): @@ -40,7 +42,6 @@ def run_pipeline(): print() nlp_1 = spacy.load('en_core_web_lg') nlp_2 = None - kb_1 = None kb_2 = None # one-time methods to create KB and write to file @@ -114,7 +115,7 @@ def run_pipeline(): # test KB if to_test_kb: - test_kb(kb_2) + check_kb(kb_2) print() # STEP 5: create a training dataset from WP @@ -122,19 +123,21 @@ def run_pipeline(): print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: create the entity linking pipe - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) + # STEP 6: create and train the entity linking pipe + el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - nlp_2.begin_training() + optimizer = nlp_2.begin_training() + optimizer.learn_rate = LEARN_RATE + optimizer.L2 = L2 if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) train_limit = 25000 - dev_limit = 1000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -144,6 +147,14 @@ def run_pipeline(): print("Training on", len(train_data), "articles") print() + dev_data = training_set_creator.read_training(nlp=nlp_2, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit) + + print("Dev testing on", len(dev_data), "articles") + print() + if not train_data: print("Did not find any training data") @@ -161,53 +172,55 @@ def run_pipeline(): nlp_2.update( docs, golds, + sgd=optimizer, drop=DROPOUT, losses=losses, ) batchnr += 1 except Exception as e: print("Error updating batch:", e) - raise(e) if batchnr > 0: - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - - dev_data = training_set_creator.read_training(nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit) - - print() - print("Dev testing on", len(dev_data), "articles") + with el_pipe.model.use_params(optimizer.averages): + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_context, dev_acc_context_dict = 
_measure_accuracy(dev_data, el_pipe) + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2), + " / dev acc context avg", round(dev_acc_context, 3)) + # STEP 7: measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: print() print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() - acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label = _measure_baselines(dev_data, kb_2) - print("dev acc oracle:", round(acc_oracle, 3), [(x, round(y, 3)) for x, y in acc_oracle_by_label.items()]) - print("dev acc random:", round(acc_random, 3), [(x, round(y, 3)) for x, y in acc_random_by_label.items()]) - print("dev acc prior:", round(acc_prior, 3), [(x, round(y, 3)) for x, y in acc_prior_by_label.items()]) + acc_r, acc_r_by_label, acc_p, acc_p_by_label, acc_o, acc_o_by_label = _measure_baselines(dev_data, kb_2) + print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_by_label.items()]) + print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_by_label.items()]) + print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_by_label.items()]) - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc combo:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + with el_pipe.model.use_params(optimizer.averages): + # measuring combined accuracy (prior + context) + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_context, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context:", round(dev_acc_context, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) - print() + # using only context + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context avg:", round(dev_acc_context, 3), + [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) + print() # reset for follow-up tests el_pipe.context_weight = 1 el_pipe.prior_weight = 1 + # STEP 8: apply the EL pipe on a toy example if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) @@ -215,6 +228,7 @@ def run_pipeline(): run_el_toy_example(nlp=nlp_2) print() + # STEP 9: write the NLP pipeline (including entity linker) to file if to_write_nlp: print() print("STEP 9: testing NLP IO", datetime.datetime.now()) @@ -225,6 +239,7 @@ def run_pipeline(): print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) + # verify that the IO has gone correctly if to_read_nlp: print() print("running toy example with NLP 2") @@ -272,6 +287,7 @@ def _measure_accuracy(data, el_pipe): def _measure_baselines(data, kb): + # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound random_correct_by_label = dict() random_incorrect_by_label = dict() @@ -362,7 +378,7 @@ def calculate_acc(correct_by_label, incorrect_by_label): return acc, 
acc_by_label -def test_kb(kb): +def check_kb(kb): for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"): candidates = kb.get_candidates(mention) @@ -384,7 +400,7 @@ def run_el_toy_example(nlp): print() # Q4426480 is her husband - text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + text = "Ada Lovelace was the countess of Lovelace. She's known for her programming work on the analytical engine. "\ "She loved her husband William King dearly. " doc = nlp(text) print(text) @@ -393,7 +409,7 @@ def run_el_toy_example(nlp): print() # Q3568763 is her tutor - text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + text = "Ada Lovelace was the countess of Lovelace. She's known for her programming work on the analytical engine. "\ "She was tutored by her favorite physics tutor William King." doc = nlp(text) print(text) diff --git a/spacy/_ml.py b/spacy/_ml.py index 29772c5ee..9139152aa 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -661,10 +661,11 @@ def build_nel_encoder(in_width, hidden_width, end_width, **cfg): LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) encoder = SpacyVectors \ - >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> with_flatten(Affine(hidden_width, in_width))\ + >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ >> flatten_add_lengths \ >> ParametricAttention(hidden_width) \ - >> Pooling(mean_pool) \ + >> Pooling(sum_pool) \ >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index fbdca8280..7d90c4438 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1078,33 +1078,19 @@ class EntityLinker(Pipe): raise ValueError("entity_width not found") embed_width = cfg.get("embed_width", 300) - hidden_width = cfg.get("hidden_width", 32) - entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB - sent_width = entity_width + hidden_width = cfg.get("hidden_width", 128) + + # no default because this needs to correspond with the KB entity length + sent_width = cfg.get("entity_width") model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) - # dimension of the mention encoder needs to match the dimension of the entity encoder - # article_width = cfg.get("article_width", 128) - # sent_width = cfg.get("sent_width", 64) - # article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) - # mention_width = article_width + sent_width - # mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) - # return article_encoder, sent_encoder, mention_encoder - return model def __init__(self, **cfg): - # self.article_encoder = True - # self.sent_encoder = True - # self.mention_encoder = True self.model = True self.kb = None self.cfg = dict(cfg) - self.doc_cutoff = self.cfg.get("doc_cutoff", 5) - # self.sgd_article = None - # self.sgd_sent = None - # self.sgd_mention = None def set_kb(self, kb): self.kb = kb @@ -1131,13 +1117,6 @@ class EntityLinker(Pipe): sgd = self.create_optimizer() return sgd - # if self.mention_encoder is True: - # self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - # 
self.sgd_article = create_default_optimizer(self.article_encoder.ops) - # self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - # self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) - # return self.sgd_article - def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() self.require_kb() @@ -1166,15 +1145,11 @@ class EntityLinker(Pipe): mention = doc.text[start:end] sent_start = 0 sent_end = len(doc) - first_par_end = len(doc) for index, sent in enumerate(doc.sents): if start >= sent.start_char and end <= sent.end_char: sent_start = sent.start sent_end = sent.end - if index == self.doc_cutoff-1: - first_par_end = sent.end sentence = doc[sent_start:sent_end].as_doc() - first_par = doc[0:first_par_end].as_doc() candidates = self.kb.get_candidates(mention) for c in candidates: @@ -1184,32 +1159,15 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - # article_docs.append(first_par) sentence_docs.append(sentence) if len(entity_encodings) > 0: - # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) - - # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in range(len(article_docs))] - # mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) - sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) bp_sent(d_scores, sgd=sgd) - # gradient : concat (doc+sent) vs. 
desc - # sent_start = self.article_encoder.nO - # sent_gradients = list() - # doc_gradients = list() - # for x in mention_gradient: - # doc_gradients.append(list(x[0:sent_start])) - # sent_gradients.append(list(x[sent_start:])) - # bp_doc(doc_gradients, sgd=self.sgd_article) - # bp_sent(sent_gradients, sgd=self.sgd_sent) - if losses is not None: losses[self.name] += loss return loss @@ -1264,21 +1222,9 @@ class EntityLinker(Pipe): for i, doc in enumerate(docs): if len(doc) > 0: - first_par_end = len(doc) - for index, sent in enumerate(doc.sents): - if index == self.doc_cutoff-1: - first_par_end = sent.end - first_par = doc[0:first_par_end].as_doc() - - # doc_encoding = self.article_encoder([first_par]) for ent in doc.ents: sent_doc = ent.sent.as_doc() if len(sent_doc) > 0: - # sent_encoding = self.sent_encoder([sent_doc]) - # concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - # mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - # mention_enc_t = np.transpose(mention_encoding) - sent_encoding = self.model([sent_doc]) sent_enc_t = np.transpose(sent_encoding) From 478305cd3f16cbfad2ea6cb9ccf49f434c3395aa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 18:38:09 +0200 Subject: [PATCH 083/148] small tweaks and documentation --- bin/wiki_entity_linking/train_descriptions.py | 5 ++ .../training_set_creator.py | 9 ++- bin/wiki_entity_linking/wikidata_processor.py | 2 +- .../wikipedia_processor.py | 3 +- examples/pipeline/wikidata_entity_linking.py | 14 +++-- spacy/language.py | 2 +- spacy/pipeline/pipes.pyx | 60 ++++++++----------- 7 files changed, 49 insertions(+), 46 deletions(-) diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index cc5016237..82db582dc 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -12,6 +12,10 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: + """ + Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). + This entity vector will be stored in the KB, and context vectors will be trained to be similar to them. + """ DROP = 0 EPOCHS = 5 @@ -102,6 +106,7 @@ class EntityEncoder: def _build_network(self, orig_width, hidden_with): with Model.define_operators({">>": chain}): + # very simple encoder-decoder model self.encoder = ( Affine(hidden_with, orig_width) ) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index a0d130824..90df5d9fc 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -10,7 +10,8 @@ from spacy.gold import GoldParse from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp """ -Process Wikipedia interlinks to generate a training dataset for the EL algorithm +Process Wikipedia interlinks to generate a training dataset for the EL algorithm. +Gold-standard entities are stored in one file in standoff format (by character offset). 
""" # ENTITY_FILE = "gold_entities.csv" @@ -321,12 +322,16 @@ def read_training(nlp, training_dir, dev, limit): current_article_id = article_id ents_by_offset = dict() for ent in current_doc.ents: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + sent_length = len(ent.sent) + # custom filtering to avoid too long or too short sentences + if 5 < sent_length < 100: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: skip_articles.add(current_article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) + skip_articles.add(current_article_id) # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py index 899c607cc..85d3d8488 100644 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -10,7 +10,7 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js def read_wikidata_entities_json(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """ + # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. lang = 'en' site_filter = 'enwiki' diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 0747c9db7..d957fc58c 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -8,6 +8,7 @@ import datetime """ Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. +Write these results to file for downstream KB and training data generation. 
""" @@ -142,7 +143,7 @@ def _capitalize_first(text): def write_entity_counts(prior_prob_input, count_output, to_print=False): - """ Write entity counts for quick access later """ + # Write entity counts for quick access later entity_to_count = dict() total_count = 0 diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index d537cce7e..c282c7262 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -195,10 +195,11 @@ def run_pipeline(): print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() - acc_r, acc_r_by_label, acc_p, acc_p_by_label, acc_o, acc_o_by_label = _measure_baselines(dev_data, kb_2) - print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_by_label.items()]) - print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_by_label.items()]) - print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_by_label.items()]) + counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2) + print("dev counts:", sorted(counts)) + print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()]) + print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) + print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) with el_pipe.model.use_params(optimizer.averages): # measuring combined accuracy (prior + context) @@ -288,6 +289,8 @@ def _measure_accuracy(data, el_pipe): def _measure_baselines(data, kb): # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound + counts_by_label = dict() + random_correct_by_label = dict() random_incorrect_by_label = dict() @@ -315,6 +318,7 @@ def _measure_baselines(data, kb): # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: + counts_by_label[ent_label] = counts_by_label.get(ent_label, 0) + 1 candidates = kb.get_candidates(ent.text) oracle_candidate = "" best_candidate = "" @@ -353,7 +357,7 @@ def _measure_baselines(data, kb): acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) - return acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + return counts_by_label, acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label def calculate_acc(correct_by_label, incorrect_by_label): diff --git a/spacy/language.py b/spacy/language.py index 0e5e29244..2225a763e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,7 +11,7 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly -from spacy.kb import KnowledgeBase +from .kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7d90c4438..99c361964 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,7 +14,6 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module -from spacy.kb import KnowledgeBase from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from 
..syntax.ner cimport BiluoPushDown @@ -1081,9 +1080,9 @@ class EntityLinker(Pipe): hidden_width = cfg.get("hidden_width", 128) # no default because this needs to correspond with the KB entity length - sent_width = cfg.get("entity_width") + entity_width = cfg.get("entity_width") - model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) + model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) return model @@ -1135,21 +1134,13 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - # article_docs = list() - sentence_docs = list() + context_docs = list() entity_encodings = list() for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] - sent_start = 0 - sent_end = len(doc) - for index, sent in enumerate(doc.sents): - if start >= sent.start_char and end <= sent.end_char: - sent_start = sent.start - sent_end = sent.end - sentence = doc[sent_start:sent_end].as_doc() candidates = self.kb.get_candidates(mention) for c in candidates: @@ -1159,14 +1150,14 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - sentence_docs.append(sentence) + context_docs.append(doc) if len(entity_encodings) > 0: - sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) + context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) - loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) - bp_sent(d_scores, sgd=sgd) + loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) + bp_context(d_scores, sgd=sgd) if losses is not None: losses[self.name] += loss @@ -1222,28 +1213,25 @@ class EntityLinker(Pipe): for i, doc in enumerate(docs): if len(doc) > 0: + context_encoding = self.model([doc]) + context_enc_t = np.transpose(context_encoding) for ent in doc.ents: - sent_doc = ent.sent.as_doc() - if len(sent_doc) > 0: - sent_encoding = self.model([sent_doc]) - sent_enc_t = np.transpose(sent_encoding) + candidates = self.kb.get_candidates(ent.text) + if candidates: + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - candidates = self.kb.get_candidates(ent.text) - if candidates: - scores = list() - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), sent_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
- scores.append(score) - - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate.entity_) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids From a31648d28be3ed10a3f8ba5cefc85f94ce22b715 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 09:15:43 +0200 Subject: [PATCH 084/148] further code cleanup --- bin/wiki_entity_linking/kb_creator.py | 36 ++++----- bin/wiki_entity_linking/train_descriptions.py | 7 -- .../training_set_creator.py | 27 +++---- bin/wiki_entity_linking/wikidata_processor.py | 10 +-- .../wikipedia_processor.py | 21 ++--- examples/pipeline/wikidata_entity_linking.py | 77 ++++++++----------- spacy/kb.pxd | 2 - spacy/kb.pyx | 50 +----------- spacy/pipeline/pipes.pyx | 12 +-- 9 files changed, 76 insertions(+), 166 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 8d293a0a1..bd82e5b4e 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -1,31 +1,31 @@ # coding: utf-8 from __future__ import unicode_literals -from bin.wiki_entity_linking.train_descriptions import EntityEncoder +from .train_descriptions import EntityEncoder +from . import wikidata_processor as wd, wikipedia_processor as wp from spacy.kb import KnowledgeBase import csv import datetime -from bin.wiki_entity_linking import wikidata_processor as wd, wikipedia_processor as wp -INPUT_DIM = 300 # dimension of pre-trained vectors -DESC_WIDTH = 64 +INPUT_DIM = 300 # dimension of pre-trained input vectors +DESC_WIDTH = 64 # dimension of output entity vectors def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, entity_def_output, entity_descr_output, - count_input, prior_prob_input, to_print=False): + count_input, prior_prob_input, wikidata_input): # Create the knowledge base from Wikidata entries kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) # disable this part of the pipeline when rerunning the KB generation from preprocessed files - read_raw_data = False + read_raw_data = True if read_raw_data: print() print(" * _read_wikidata_entities", datetime.datetime.now()) - title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) + title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input) # write the title-ID and ID-description mappings to file _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) @@ -40,7 +40,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() entity_frequencies = wp.get_all_frequencies(count_input=count_input) - # filter the entities for in the KB by frequency, because there's just too much data otherwise + # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise filtered_title_to_id = dict() entity_list = list() description_list = list() @@ -60,11 +60,10 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() print(" * train entity encoder", datetime.datetime.now()) print() - encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) encoder.train(description_list=description_list, to_print=True) - print() + print() print(" * get entity embeddings", datetime.datetime.now()) print() embeddings = 
encoder.apply_encoder(description_list) @@ -80,12 +79,10 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, prior_prob_input=prior_prob_input) - if to_print: - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) print("done with kb", datetime.datetime.now()) - return kb @@ -94,6 +91,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ id_file.write("WP_title" + "|" + "WD_id" + "\n") for title, qid in title_to_id.items(): id_file.write(title + "|" + str(qid) + "\n") + with open(entity_descr_output, mode='w', encoding='utf8') as descr_file: descr_file.write("WD_id" + "|" + "description" + "\n") for qid, descr in id_to_descr.items(): @@ -108,7 +106,6 @@ def get_entity_to_id(entity_def_output): next(csvreader) for row in csvreader: entity_to_id[row[0]] = row[1] - return entity_to_id @@ -120,16 +117,12 @@ def _get_id_to_description(entity_descr_output): next(csvreader) for row in csvreader: id_to_desc[row[0]] = row[1] - return id_to_desc -def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False): +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input): wp_titles = title_to_id.keys() - if to_print: - print("wp titles:", wp_titles) - # adding aliases with prior probabilities # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: @@ -176,6 +169,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in line = prior_file.readline() - if to_print: - print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) - diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index 82db582dc..948a0e2d1 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -32,8 +32,6 @@ class EntityEncoder: if self.encoder is None: raise ValueError("Can not apply encoder before training it") - print("Encoding", len(description_list), "entities") - batch_size = 100000 start = 0 @@ -48,13 +46,11 @@ class EntityEncoder: start = start + batch_size stop = min(stop + batch_size, len(description_list)) - print("encoded :", len(encodings)) return encodings def train(self, description_list, to_print=False): processed, loss = self._train_model(description_list) - if to_print: print("Trained on", processed, "entities across", self.EPOCHS, "epochs") print("Final loss:", loss) @@ -111,15 +107,12 @@ class EntityEncoder: Affine(hidden_with, orig_width) ) self.model = self.encoder >> zero_init(Affine(orig_width, hidden_with, drop_factor=0.0)) - self.sgd = create_default_optimizer(self.model.ops) def _update(self, vectors): predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) - loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) bp_model(d_scores, sgd=self.sgd) - return loss / len(vectors) @staticmethod diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 90df5d9fc..eb9f8af78 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -18,23 +18,21 @@ Gold-standard entities are stored in one file in standoff format (by 
character o ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing -def create_training(entity_def_input, training_output): +def create_training(wikipedia_input, entity_def_input, training_output): wp_to_id = kb_creator.get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wp_to_id, training_output, limit=None) + _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None) -def _process_wikipedia_texts(wp_to_id, training_output, limit=None): +def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: raw text data + positive instances """ - title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') read_ids = set() - - entityfile_loc = training_output + "/" + ENTITY_FILE + entityfile_loc = training_output / ENTITY_FILE with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: # write entity training header file _write_training_entity(outputfile=entityfile, @@ -44,7 +42,7 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): start="start", end="end") - with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: + with bz2.open(wikipedia_input, mode='rb') as file: line = file.readline() cnt = 0 article_text = "" @@ -104,7 +102,7 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): print("Found duplicate article ID", article_id, clean_line) # This should never happen ... read_ids.add(article_id) - # read the title of this article (outside the revision portion of the document) + # read the title of this article (outside the revision portion of the document) if not reading_revision: titles = title_regex.search(clean_line) if titles: @@ -134,7 +132,7 @@ def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_te # get the raw text without markup etc, keeping only interwiki links clean_text = _get_clean_wp_text(text) - # read the text char by char to get the right offsets of the interwiki links + # read the text char by char to get the right offsets for the interwiki links final_text = "" open_read = 0 reading_text = True @@ -274,7 +272,7 @@ def _get_clean_wp_text(article_text): def _write_training_article(article_id, clean_text, training_output): - file_loc = training_output + "/" + str(article_id) + ".txt" + file_loc = training_output / str(article_id) + ".txt" with open(file_loc, mode='w', encoding='utf8') as outputfile: outputfile.write(clean_text) @@ -289,11 +287,10 @@ def is_dev(article_id): def read_training(nlp, training_dir, dev, limit): # This method provides training examples that correspond to the entity annotations found by the nlp object - - entityfile_loc = training_dir + "/" + ENTITY_FILE + entityfile_loc = training_dir / ENTITY_FILE data = [] - # we assume the data is written sequentially + # assume the data is written sequentially, so we can reuse the article docs current_article_id = None current_doc = None ents_by_offset = dict() @@ -347,10 +344,10 @@ def read_training(nlp, training_dir, dev, limit): gold_end = int(end) - found_ent.sent.start_char gold_entities = list() gold_entities.append((gold_start, gold_end, wp_title)) - gold = GoldParse(doc=current_doc, links=gold_entities) + gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) total_entities += 1 - if len(data) % 500 == 0: + if len(data) % 2500 == 0: print(" -read", total_entities, "entities") print(" -read", total_entities, "entities") diff --git 
a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py index 85d3d8488..a32a0769a 100644 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -5,17 +5,15 @@ import bz2 import json import datetime -# TODO: remove hardcoded paths -WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' - -def read_wikidata_entities_json(limit=None, to_print=False): +def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False): # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. + # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ lang = 'en' site_filter = 'enwiki' - # filter currently disabled to get ALL data + # properties filter (currently disabled to get ALL data) prop_filter = dict() # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected @@ -30,7 +28,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): parse_aliases = False parse_claims = False - with bz2.open(WIKIDATA_JSON, mode='rb') as file: + with bz2.open(wikidata_file, mode='rb') as file: line = file.readline() cnt = 0 while line and (not limit or cnt < limit): diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index d957fc58c..c02e472bc 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -11,11 +11,6 @@ Process a Wikipedia dump to calculate entity frequencies and prior probabilities Write these results to file for downstream KB and training data generation. """ - -# TODO: remove hardcoded paths -ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' -ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' - map_alias_to_link = dict() # these will/should be matched ignoring case @@ -46,15 +41,13 @@ for ns in wiki_namespaces: ns_regex = re.compile(ns_regex, re.IGNORECASE) -def read_wikipedia_prior_probs(prior_prob_output): +def read_wikipedia_prior_probs(wikipedia_input, prior_prob_output): """ - Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities - The full file takes about 2h to parse 1100M lines (update printed every 5M lines). - It works relatively fast because we don't care about which article we parsed the interwiki from, - we just process line by line. + Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities. + The full file takes about 2h to parse 1100M lines. + It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from. 
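# Illustrative sketch, not part of the patch: once this function has written the
# "alias|count|entity" counts described here, the prior probability P(entity | alias)
# is just the per-entity count divided by the total count for that alias. The sample
# counts below are made up for the example.
from collections import defaultdict

link_counts = [
    ("Douglas", 10, "Douglas Adams"),
    ("Douglas", 5, "Douglas Hofstadter"),
    ("Douglas", 5, "Douglas fir"),
]

totals = defaultdict(int)
for alias, count, entity in link_counts:
    totals[alias] += count

prior_probs = {
    (alias, entity): count / float(totals[alias])
    for alias, count, entity in link_counts
}
print(prior_probs[("Douglas", "Douglas Adams")])  # 0.5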
""" - - with bz2.open(ENWIKI_DUMP, mode='rb') as file: + with bz2.open(wikipedia_input, mode='rb') as file: line = file.readline() cnt = 0 while line: @@ -70,7 +63,7 @@ def read_wikipedia_prior_probs(prior_prob_output): line = file.readline() cnt += 1 - # write all aliases and their entities and occurrences to file + # write all aliases and their entities and count occurrences to file with open(prior_prob_output, mode='w', encoding='utf8') as outputfile: outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): @@ -108,7 +101,7 @@ def get_wp_links(text): if ns_regex.match(match): pass # ignore namespaces at the beginning of the string - # this is a simple link, with the alias the same as the mention + # this is a simple [[link]], with the alias the same as the mention elif "|" not in match: aliases.append(match) entities.append(match) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index c282c7262..aa1c00996 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -2,35 +2,45 @@ from __future__ import unicode_literals import random - -from spacy.util import minibatch, compounding +import datetime +from pathlib import Path from bin.wiki_entity_linking import training_set_creator, kb_creator, wikipedia_processor as wp from bin.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.kb import KnowledgeBase -import datetime +from spacy.util import minibatch, compounding """ Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """ -PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' -ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' -ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' -ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' +ROOT_DIR = Path("C:/Users/Sofie/Documents/data/") +OUTPUT_DIR = ROOT_DIR / 'wikipedia' +TRAINING_DIR = OUTPUT_DIR / 'training_data_nel' -KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb' -NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' -NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' +PRIOR_PROB = OUTPUT_DIR / 'prior_prob.csv' +ENTITY_COUNTS = OUTPUT_DIR / 'entity_freq.csv' +ENTITY_DEFS = OUTPUT_DIR / 'entity_defs.csv' +ENTITY_DESCR = OUTPUT_DIR / 'entity_descriptions.csv' -TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' +KB_FILE = OUTPUT_DIR / 'kb_1' / 'kb' +NLP_1_DIR = OUTPUT_DIR / 'nlp_1' +NLP_2_DIR = OUTPUT_DIR / 'nlp_2' +# get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ +WIKIDATA_JSON = ROOT_DIR / 'wikidata' / 'wikidata-20190304-all.json.bz2' + +# get enwiki-latest-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/enwiki/latest/ +ENWIKI_DUMP = ROOT_DIR / 'wikipedia' / 'enwiki-20190320-pages-articles-multistream.xml.bz2' + +# KB construction parameters MAX_CANDIDATES = 10 MIN_ENTITY_FREQ = 20 MIN_PAIR_OCC = 5 +# model training parameters EPOCHS = 10 DROPOUT = 0.1 LEARN_RATE = 0.005 @@ -38,6 +48,7 @@ L2 = 1e-6 def run_pipeline(): + # set the appropriate booleans to define which parts of the pipeline should be re(run) print("START", datetime.datetime.now()) print() nlp_1 = spacy.load('en_core_web_lg') @@ -67,22 +78,19 @@ def run_pipeline(): to_write_nlp = False to_read_nlp = False - # STEP 1 : create prior 
probabilities from WP - # run only once ! + # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB) + wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB) print() - # STEP 2 : deduce entity frequencies from WP - # run only once ! + # STEP 2 : deduce entity frequencies from WP (run only once) if to_create_entity_counts: print("STEP 2: to_create_entity_counts", datetime.datetime.now()) wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False) print() - # STEP 3 : create KB and write to file - # run only once ! + # STEP 3 : create KB and write to file (run only once) if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) kb_1 = kb_creator.create_kb(nlp_1, @@ -93,7 +101,7 @@ def run_pipeline(): entity_descr_output=ENTITY_DESCR, count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, - to_print=False) + wikidata_input=WIKIDATA_JSON) print("kb entities:", kb_1.get_size_entities()) print("kb aliases:", kb_1.get_size_aliases()) print() @@ -121,7 +129,9 @@ def run_pipeline(): # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP, + entity_def_input=ENTITY_DEFS, + training_output=TRAINING_DIR) # STEP 6: create and train the entity linking pipe el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) @@ -136,7 +146,8 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 25000 + # define the size (nr of entities) of training and dev set + train_limit = 10000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, @@ -157,7 +168,6 @@ def run_pipeline(): if not train_data: print("Did not find any training data") - else: for itn in range(EPOCHS): random.shuffle(train_data) @@ -196,7 +206,7 @@ def run_pipeline(): print() counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2) - print("dev counts:", sorted(counts)) + print("dev counts:", sorted(counts.items(), key=lambda x: x[0])) print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()]) print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) @@ -215,7 +225,6 @@ def run_pipeline(): dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) print("dev acc context avg:", round(dev_acc_context, 3), [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) - print() # reset for follow-up tests el_pipe.context_weight = 1 @@ -227,7 +236,6 @@ def run_pipeline(): print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) print() run_el_toy_example(nlp=nlp_2) - print() # STEP 9: write the NLP pipeline (including entity linker) to file if to_write_nlp: @@ -400,26 +408,9 @@ def run_el_toy_example(nlp): doc = nlp(text) print(text) for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) + print(" ent", ent.text, ent.label_, ent.kb_id_) print() - # Q4426480 is her husband - text = "Ada Lovelace was the countess of Lovelace. 
She's known for her programming work on the analytical engine. "\ - "She loved her husband William King dearly. " - doc = nlp(text) - print(text) - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - print() - - # Q3568763 is her tutor - text = "Ada Lovelace was the countess of Lovelace. She's known for her programming work on the analytical engine. "\ - "She was tutored by her favorite physics tutor William King." - doc = nlp(text) - print(text) - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - if __name__ == "__main__": run_pipeline() diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 9c5a73d59..ccf150cd2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -18,7 +18,6 @@ ctypedef vector[float_vec] float_matrix # Object used by the Entity Linker that summarizes one entity-alias candidate combination. cdef class Candidate: - cdef readonly KnowledgeBase kb cdef hash_t entity_hash cdef float entity_freq @@ -143,7 +142,6 @@ cdef class KnowledgeBase: cpdef load_bulk(self, loc) cpdef set_entities(self, entity_list, prob_list, vector_list) - cpdef set_aliases(self, alias_list, entities_list, probabilities_list) cdef class Writer: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 9a84439ea..72f66b107 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,23 +1,16 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -from collections import OrderedDict -from pathlib import Path, WindowsPath - -from cpython.exc cimport PyErr_CheckSignals - -from spacy import util from spacy.errors import Errors, Warnings, user_warning +from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from cpython.mem cimport PyMem_Malloc from cpython.exc cimport PyErr_SetFromErrno -from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek +from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t -from libc.stdlib cimport qsort from .typedefs cimport hash_t @@ -25,7 +18,6 @@ from os import path from libcpp.vector cimport vector - cdef class Candidate: def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): @@ -79,8 +71,6 @@ cdef class KnowledgeBase: self._entry_index = PreshMap() self._alias_index = PreshMap() - # Should we initialize self._entries and self._aliases_table to specific starting size ? 
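# Minimal usage sketch of the KnowledgeBase API exercised throughout this patch series,
# not part of the patch itself. It requires a spaCy build that includes these changes,
# and the IDs, probabilities and vectors below are made up for illustration.
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# every entity vector has to match the entity_vector_length declared above
kb.add_entity(entity="Q42", prob=0.7, entity_vector=[1.0, 2.0, 0.0])
kb.add_entity(entity="Q5301561", prob=0.1, entity_vector=[0.0, 1.0, 2.0])

# an alias maps to candidate entities with prior probabilities summing to at most 1
kb.add_alias(alias="Douglas", entities=["Q42", "Q5301561"], probabilities=[0.6, 0.1])

for candidate in kb.get_candidates("Douglas"):
    print(candidate.entity_, candidate.prior_prob)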
- self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) @@ -165,47 +155,11 @@ cdef class KnowledgeBase: i += 1 - # TODO: this method is untested - cpdef set_aliases(self, alias_list, entities_list, probabilities_list): - nr_aliases = len(alias_list) - self._alias_index = PreshMap(nr_aliases+1) - self._aliases_table = alias_vec(nr_aliases+1) - - i = 0 - cdef AliasC alias - cdef int32_t dummy_value = 342 - while i <= nr_aliases: - alias_hash = self.vocab.strings.add(alias_list[i]) - entities = entities_list[i] - probabilities = probabilities_list[i] - - nr_candidates = len(entities) - entry_indices = vector[int64_t](nr_candidates) - probs = vector[float](nr_candidates) - - for j in range(0, nr_candidates): - entity = entities[j] - entity_hash = self.vocab.strings[entity] - if not entity_hash in self._entry_index: - raise ValueError(Errors.E134.format(alias=alias, entity=entity)) - - entry_index = self._entry_index.get(entity_hash) - entry_indices[j] = entry_index - - alias.entry_indices = entry_indices - alias.probs = probs - - self._aliases_table[i] = alias - self._alias_index[alias_hash] = i - - i += 1 - def add_alias(self, unicode alias, entities, probabilities): """ For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end """ - # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): raise ValueError(Errors.E132.format(alias=alias, diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 99c361964..1c430a90b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1068,8 +1068,6 @@ class EntityLinker(Pipe): DOCS: TODO """ name = 'entity_linker' - context_weight = 1 - prior_weight = 1 @classmethod def Model(cls, **cfg): @@ -1078,18 +1076,17 @@ class EntityLinker(Pipe): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) - - # no default because this needs to correspond with the KB entity length - entity_width = cfg.get("entity_width") + entity_width = cfg.get("entity_width") # this needs to correspond with the KB entity length model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) - return model def __init__(self, **cfg): self.model = True self.kb = None self.cfg = dict(cfg) + self.context_weight = cfg.get("context_weight", 1) + self.prior_weight = cfg.get("prior_weight", 1) def set_kb(self, kb): self.kb = kb @@ -1162,7 +1159,6 @@ class EntityLinker(Pipe): if losses is not None: losses[self.name] += loss return loss - return 0 def get_loss(self, docs, golds, scores): @@ -1224,7 +1220,7 @@ class EntityLinker(Pipe): kb_id = c.entity_ entity_encoding = c.entity_vector sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
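# Worked example of the scoring on the surrounding lines (illustrative only, plain NumPy,
# made-up vectors): `sim` is the cosine similarity between the context encoding and a
# candidate's entity vector, and it is combined with the prior probability as
# prior_prob + sim - prior_prob * sim, i.e. 1 - (1 - prior)(1 - sim) when both lie in [0, 1].
import numpy as np

context_enc = np.array([0.5, 1.0, 0.0])
entity_vector = np.array([1.0, 1.0, 0.0])
prior_prob = 0.6

sim = np.dot(context_enc, entity_vector) / (
    np.linalg.norm(context_enc) * np.linalg.norm(entity_vector)
)
score = prior_prob + sim - (prior_prob * sim)
print(round(float(sim), 3), round(float(score), 3))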
+ score = prior_prob + sim - (prior_prob*sim) scores.append(score) # TODO: thresholding From cc9ae28a52df2bfc8ee96c38392522d9752a3058 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 12:35:26 +0200 Subject: [PATCH 085/148] custom error and warning messages --- bin/wiki_entity_linking/training_set_creator.py | 7 ++++--- spacy/errors.py | 3 +++ spacy/kb.pyx | 16 +++++----------- spacy/pipeline/pipes.pyx | 5 ++--- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index eb9f8af78..d9600048c 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -324,18 +324,19 @@ def read_training(nlp, training_dir, dev, limit): if 5 < sent_length < 100: ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: - skip_articles.add(current_article_id) + skip_articles.add(article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) - skip_articles.add(current_article_id) + skip_articles.add(article_id) + raise e # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: if found_ent.text != alias: - skip_articles.add(current_article_id) + skip_articles.add(article_id) current_doc = None else: sent = found_ent.sent.as_doc() diff --git a/spacy/errors.py b/spacy/errors.py index fcc3132c6..5684721ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -399,6 +399,9 @@ class Errors(object): E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input includes either the " "`text` or `tokens` key. For more info, see the docs:\n" "https://spacy.io/api/cli#pretrain-jsonl") + E139 = ("Knowledge base for component '{name}' not initialized. 
Did you forget to call set_kb()?") + E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.") + E141 = ("Entity vectors should be of length {required} instead of the provided {found}.") @add_codes diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 72f66b107..4d9d2b89b 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -106,9 +106,9 @@ cdef class KnowledgeBase: user_warning(Warnings.W018.format(entity=entity)) return + # Raise an error if the provided entity vector is not of the correct length if len(entity_vector) != self.entity_vector_length: - # TODO: proper error - raise ValueError("Entity vector length should have been", self.entity_vector_length) + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) vector_index = self.c_add_vector(entity_vector=entity_vector) @@ -121,13 +121,8 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, prob_list, vector_list): - if len(entity_list) != len(prob_list): - # TODO: proper error - raise ValueError("Entity list and prob list should have the same length") - - if len(entity_list) != len(vector_list): - # TODO: proper error - raise ValueError("Entity list and vector list should have the same length") + if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list): + raise ValueError(Errors.E140) nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) @@ -138,8 +133,7 @@ cdef class KnowledgeBase: while i < nr_entities: entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: - # TODO: proper error - raise ValueError("Entity vector is", len(entity_vector), "length but should have been", self.entity_vector_length) + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a191a7906..2f7856fe0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1002,7 +1002,7 @@ cdef class DependencyParser(Parser): @property def postprocesses(self): - return [nonproj.deprojectivize, merge_subtokens] + return [nonproj.deprojectivize] # , merge_subtokens] def add_multitask_objective(self, target): if target == "cloze": @@ -1100,8 +1100,7 @@ class EntityLinker(Pipe): def require_kb(self): # Raise an error if the knowledge base is not initialized. 
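# Sketch of the wiring that avoids the E139 error raised just below (illustrative only):
# the knowledge base has to be attached with set_kb() before begin_training() or prediction.
# `nlp` and `kb` are assumed to be an existing pipeline and KnowledgeBase from earlier steps.
el_pipe = nlp.create_pipe(name="entity_linker", config={})
el_pipe.set_kb(kb)
el_pipe.begin_training()
nlp.add_pipe(el_pipe, last=True)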
if getattr(self, "kb", None) in (None, True, False): - # TODO: custom error - raise ValueError(Errors.E109.format(name=self.name)) + raise ValueError(Errors.E139.format(name=self.name)) def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() From 0b0959b363bd0e8eeb4a9b5aa8f24f618525dbbf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 13:11:39 +0200 Subject: [PATCH 086/148] UTF8 encoding --- spacy/tests/serialize/test_serialize_kb.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 7a8022890..bcf27990b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,5 +1,4 @@ -import spacy -from spacy.lang.en import English +# coding: utf-8 from ..util import make_tempdir from ...util import ensure_path From b76a43bee4b085944f661f4cfce2bbcd11af138f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 13:26:33 +0200 Subject: [PATCH 087/148] unicode strings --- spacy/tests/serialize/test_serialize_kb.py | 32 +++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index bcf27990b..26e912738 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -29,14 +29,14 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity="Q53", prob=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity="Q17", prob=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity="Q007", prob=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity="Q44", prob=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity=u'Q53', prob=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity=u'Q17', prob=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity=u'Q007', prob=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity=u'Q44', prob=0.4, entity_vector=[4, 4, 4]) - kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) - kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + kb.add_alias(alias=u'double07', entities=[u'Q17', u'Q007'], probabilities=[0.1, 0.9]) + kb.add_alias(alias=u'guy', entities=[u'Q53', u'Q007', u'Q17', u'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb.add_alias(alias=u'random', entities=[u'Q007'], probabilities=[1.0]) return kb @@ -44,30 +44,30 @@ def _get_dummy_kb(vocab): def _check_kb(kb): # check entities assert kb.get_size_entities() == 4 - for entity_string in ["Q53", "Q17", "Q007", "Q44"]: + for entity_string in [u'Q53', u'Q17', u'Q007', u'Q44']: assert entity_string in kb.get_entity_strings() - for entity_string in ["", "Q0"]: + for entity_string in [u'', u'Q0']: assert entity_string not in kb.get_entity_strings() # check aliases assert kb.get_size_aliases() == 3 - for alias_string in ["double07", "guy", "random"]: + for alias_string in [u'double07', u'guy', u'random']: assert alias_string in kb.get_alias_strings() - for alias_string in ["nothingness", "", "randomnoise"]: + for alias_string in [u'nothingness', u'', u'randomnoise']: assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_candidates("double07"), key=lambda x: 
x.entity_) + candidates = sorted(kb.get_candidates(u'double07'), key=lambda x: x.entity_) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_ == u'Q007' assert 0.6999 < candidates[0].entity_freq < 0.701 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == "double07" + assert candidates[0].alias_ == u'double07' assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_ == u'Q17' assert 0.199 < candidates[1].entity_freq < 0.201 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == "double07" + assert candidates[1].alias_ == u'double07' assert 0.099 < candidates[1].prior_prob < 0.101 From 872121955c1ba3e8b9d4b2ee9b9ac89b2e85d1d5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Jun 2019 10:35:51 +0200 Subject: [PATCH 088/148] Update error code --- spacy/cli/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7afd10520..a95a40980 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -252,7 +252,7 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): elif objective == "cosine": loss, d_target = get_cossim_loss(prediction, target) else: - raise ValueError(Errors.E139.format(loss_func=objective)) + raise ValueError(Errors.E142.format(loss_func=objective)) return loss, d_target From b58bace84b56cc3dcc4f78e0b9dae15effdcd51e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Jun 2019 10:55:04 +0200 Subject: [PATCH 089/148] small fixes --- bin/ud/conll17_ud_eval.py | 4 ++-- bin/wiki_entity_linking/kb_creator.py | 18 +++++++++--------- .../training_set_creator.py | 2 +- examples/pipeline/wikidata_entity_linking.py | 4 ++-- spacy/pipeline/pipes.pyx | 15 +++++++-------- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/bin/ud/conll17_ud_eval.py b/bin/ud/conll17_ud_eval.py index 78a976a6d..88acfabac 100644 --- a/bin/ud/conll17_ud_eval.py +++ b/bin/ud/conll17_ud_eval.py @@ -292,8 +292,8 @@ def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True): def spans_score(gold_spans, system_spans): correct, gi, si = 0, 0, 0 - undersegmented = list() - oversegmented = list() + undersegmented = [] + oversegmented = [] combo = 0 previous_end_si_earlier = False previous_end_gi_earlier = False diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index bd82e5b4e..6ee139174 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -42,9 +42,9 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise filtered_title_to_id = dict() - entity_list = list() - description_list = list() - frequency_list = list() + entity_list = [] + description_list = [] + frequency_list = [] for title, entity in title_to_id.items(): freq = entity_frequencies.get(title, 0) desc = id_to_descr.get(entity, None) @@ -131,8 +131,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in line = prior_file.readline() previous_alias = None total_count = 0 - counts = list() - entities = list() + counts = [] + entities = [] while line: splits = line.replace('\n', "").split(sep='|') new_alias = splits[0] @@ -142,8 +142,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in if new_alias != previous_alias 
and previous_alias: # done reading the previous alias --> output if len(entities) > 0: - selected_entities = list() - prior_probs = list() + selected_entities = [] + prior_probs = [] for ent_count, ent_string in zip(counts, entities): if ent_string in wp_titles: wd_id = title_to_id[ent_string] @@ -157,8 +157,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in except ValueError as e: print(e) total_count = 0 - counts = list() - entities = list() + counts = [] + entities = [] total_count += count diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index d9600048c..51105ce09 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -343,7 +343,7 @@ def read_training(nlp, training_dir, dev, limit): # currently feeding the gold data one entity per sentence at a time gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - gold_entities = list() + gold_entities = [] gold_entities.append((gold_start, gold_end, wp_title)) gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index aa1c00996..2759da135 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -147,7 +147,7 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set - train_limit = 10000 + train_limit = 5000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, @@ -332,7 +332,7 @@ def _measure_baselines(data, kb): best_candidate = "" random_candidate = "" if candidates: - scores = list() + scores = [] for c in candidates: scores.append(c.prior_prob) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2f7856fe0..2eaedd73a 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1131,8 +1131,8 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - context_docs = list() - entity_encodings = list() + context_docs = [] + entity_encodings = [] for doc, gold in zip(docs, golds): for entity in gold.links: @@ -1198,8 +1198,8 @@ class EntityLinker(Pipe): self.require_model() self.require_kb() - final_entities = list() - final_kb_ids = list() + final_entities = [] + final_kb_ids = [] if not docs: return final_entities, final_kb_ids @@ -1214,7 +1214,7 @@ class EntityLinker(Pipe): for ent in doc.ents: candidates = self.kb.get_candidates(ent.text) if candidates: - scores = list() + scores = [] for c in candidates: prior_prob = c.prior_prob * self.prior_weight kb_id = c.entity_ @@ -1259,11 +1259,10 @@ class EntityLinker(Pipe): return self def rehearse(self, docs, sgd=None, losses=None, **config): - # TODO - pass + raise NotImplementedError def add_label(self, label): - pass + raise NotImplementedError class Sentencizer(object): From ddc73b11a9caae7497b5d8d90e97a5b13b9dc6fa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Jun 2019 12:58:18 +0200 Subject: [PATCH 090/148] fix unicode literals --- spacy/tests/pipeline/test_entity_linker.py | 54 +++++++++++----------- spacy/tests/serialize/test_serialize_kb.py | 35 +++++++------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index b44332df4..b12ad3917 
100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -17,13 +17,13 @@ def test_kb_valid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.5, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.5, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) - mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2]) + mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9]) # test the size of the corresponding KB assert(mykb.get_size_entities() == 3) @@ -35,13 +35,13 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q342'], probabilities=[0.8, 0.2]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q342'], probabilities=[0.8, 0.2]) def test_kb_invalid_probabilities(nlp): @@ -49,13 +49,13 @@ def test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.4]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.4]) def test_kb_invalid_combination(nlp): @@ -63,13 +63,13 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.3, 0.4, 0.1]) def test_kb_invalid_entity_vector(nlp): @@ -77,11 +77,11 @@ def test_kb_invalid_entity_vector(nlp): mykb = 
KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1, 2, 3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1, 2, 3]) # this should fail because the kb's expected entity vector length is 3 with pytest.raises(ValueError): - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) def test_candidate_generation(nlp): @@ -89,15 +89,15 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) - mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2]) + mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9]) # test the size of the relevant candidates - assert(len(mykb.get_candidates(u'douglas')) == 2) - assert(len(mykb.get_candidates(u'adam')) == 1) - assert(len(mykb.get_candidates(u'shrubbery')) == 0) + assert(len(mykb.get_candidates('douglas')) == 2) + assert(len(mykb.get_candidates('adam')) == 1) + assert(len(mykb.get_candidates('shrubbery')) == 0) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 26e912738..fa7253fa1 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + from ..util import make_tempdir from ...util import ensure_path @@ -16,7 +18,6 @@ def test_serialize_kb_disk(en_vocab): if not dir_path.exists(): dir_path.mkdir() file_path = dir_path / "kb" - print(file_path, type(file_path)) kb1.dump(str(file_path)) kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3) @@ -29,14 +30,14 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity=u'Q53', prob=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity=u'Q17', prob=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity=u'Q007', prob=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity=u'Q44', prob=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4]) - kb.add_alias(alias=u'double07', entities=[u'Q17', u'Q007'], probabilities=[0.1, 0.9]) - kb.add_alias(alias=u'guy', entities=[u'Q53', u'Q007', u'Q17', u'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb.add_alias(alias=u'random', entities=[u'Q007'], probabilities=[1.0]) + kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) + kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0]) return kb @@ -44,30 +45,30 @@ def _get_dummy_kb(vocab): def _check_kb(kb): # check entities assert 
kb.get_size_entities() == 4 - for entity_string in [u'Q53', u'Q17', u'Q007', u'Q44']: + for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']: assert entity_string in kb.get_entity_strings() - for entity_string in [u'', u'Q0']: + for entity_string in ['', 'Q0']: assert entity_string not in kb.get_entity_strings() # check aliases assert kb.get_size_aliases() == 3 - for alias_string in [u'double07', u'guy', u'random']: + for alias_string in ['double07', 'guy', 'random']: assert alias_string in kb.get_alias_strings() - for alias_string in [u'nothingness', u'', u'randomnoise']: + for alias_string in ['nothingness', '', 'randomnoise']: assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_candidates(u'double07'), key=lambda x: x.entity_) + candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_) assert len(candidates) == 2 - assert candidates[0].entity_ == u'Q007' + assert candidates[0].entity_ == 'Q007' assert 0.6999 < candidates[0].entity_freq < 0.701 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == u'double07' + assert candidates[0].alias_ == 'double07' assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == u'Q17' + assert candidates[1].entity_ == 'Q17' assert 0.199 < candidates[1].entity_freq < 0.201 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == u'double07' + assert candidates[1].alias_ == 'double07' assert 0.099 < candidates[1].prior_prob < 0.101 From 58a5b40ef6e58d30afd57eec8189b54bded32a47 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Jun 2019 15:19:58 +0200 Subject: [PATCH 091/148] clean up duplicate code --- spacy/pipeline/pipes.pyx | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2eaedd73a..47ba4dc05 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -12,8 +12,8 @@ from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm from thinc.neural.util import to_categorical -from thinc.neural.util import get_array_module +from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -1162,26 +1162,11 @@ class EntityLinker(Pipe): return 0 def get_loss(self, docs, golds, scores): - targets = [[1] for _ in golds] # assuming we're only using positive examples - loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets) + # this loss function assumes we're only using positive examples + loss, gradients = get_cossim_loss(yh=scores, y=golds) loss = loss / len(golds) return loss, gradients - def get_cossim_loss_2(self, yh, y, t): - # Add a small constant to avoid 0 vectors - yh = yh + 1e-8 - y = y + 1e-8 - # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - xp = get_array_module(yh) - norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) - norm_y = xp.linalg.norm(y, axis=1, keepdims=True) - mul_norms = norm_yh * norm_y - cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms - d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) - loss = xp.abs(cos - t).sum() - inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) - return loss, -inverse - def __call__(self, doc): entities, kb_ids = self.predict([doc]) self.set_annotations([doc], entities, kb_ids) From 86086855436f15bf24a9d3d0993a1dafc4003d1a Mon Sep 17 
00:00:00 2001 From: svlandeg Date: Tue, 25 Jun 2019 15:28:51 +0200 Subject: [PATCH 092/148] ensure Span.as_doc keeps the entity links + unit test --- spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + spacy/tests/pipeline/test_entity_linker.py | 42 ++++++++++++++++++++++ spacy/tokens/doc.pyx | 7 ++-- spacy/tokens/span.pyx | 2 +- spacy/tokens/token.pxd | 4 +++ 8 files changed, 56 insertions(+), 3 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 79a177ba9..c5ba8d765 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -82,6 +82,7 @@ cdef enum attr_id_t: DEP ENT_IOB ENT_TYPE + ENT_KB_ID HEAD SENT_START SPACY diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index ed1f39a3f..8eeea363f 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -84,6 +84,7 @@ IDS = { "DEP": DEP, "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE, + "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, "SPACY": SPACY, diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 051b92edb..4501861a2 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -81,6 +81,7 @@ cdef enum symbol_t: DEP ENT_IOB ENT_TYPE + ENT_KB_ID HEAD SENT_START SPACY diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 949621820..b65ae9628 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -86,6 +86,7 @@ IDS = { "DEP": DEP, "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE, + "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, "SPACY": SPACY, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index b12ad3917..7ea893408 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -5,6 +5,7 @@ import pytest from spacy.kb import KnowledgeBase from spacy.lang.en import English +from spacy.pipeline import EntityRuler @pytest.fixture @@ -101,3 +102,44 @@ def test_candidate_generation(nlp): assert(len(mykb.get_candidates('douglas')) == 2) assert(len(mykb.get_candidates('adam')) == 1) assert(len(mykb.get_candidates('shrubbery')) == 0) + + +def test_preserving_links_asdoc(nlp): + """Test that Span.as_doc preserves the existing entity links""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + + # adding entities + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.8, entity_vector=[1]) + + # adding aliases + mykb.add_alias(alias='Boston', entities=['Q1'], probabilities=[0.7]) + mykb.add_alias(alias='Denver', entities=['Q2'], probabilities=[0.6]) + + # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained) + sentencizer = nlp.create_pipe("sentencizer") + nlp.add_pipe(sentencizer) + + ruler = EntityRuler(nlp) + patterns = [{"label": "GPE", "pattern": "Boston"}, + {"label": "GPE", "pattern": "Denver"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + el_pipe = nlp.create_pipe(name='entity_linker', config={}) + el_pipe.set_kb(mykb) + el_pipe.begin_training() + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + nlp.add_pipe(el_pipe, last=True) + + # test whether the entity links are preserved by the `as_doc()` function + text = "She lives in Boston. He lives in Denver." 
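# Illustrative aside, not part of the test: Span.as_doc can only carry the link along
# because ENT_KB_ID is now a regular token attribute (see the attrs/symbols changes above),
# so it survives the to_array/from_array round trip that as_doc performs. A rough sketch of
# that round trip by hand, assuming a `doc` processed by a pipeline like the one above and
# a spaCy build where ENT_KB_ID is importable from spacy.attrs:
from spacy.attrs import ENT_IOB, ENT_TYPE, ENT_KB_ID
from spacy.tokens import Doc

def copy_entity_links(doc):
    # rebuild a Doc from plain words and copy the NER and KB id columns across
    attr_ids = [ENT_IOB, ENT_TYPE, ENT_KB_ID]
    arr = doc.to_array(attr_ids)
    new_doc = Doc(doc.vocab, words=[t.text for t in doc],
                  spaces=[bool(t.whitespace_) for t in doc])
    new_doc.from_array(attr_ids, arr)
    return new_doc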
+ doc = nlp(text) + for ent in doc.ents: + orig_text = ent.text + orig_kb_id = ent.kb_id_ + sent_doc = ent.sent.as_doc() + for s_ent in sent_doc.ents: + if s_ent.text == orig_text: + assert s_ent.kb_id_ == orig_kb_id diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 131c43d37..10f57ed60 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -22,7 +22,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, SENT_START, attr_id_t +from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS @@ -64,6 +64,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return token.ent_iob elif feat_name == ENT_TYPE: return token.ent_type + elif feat_name == ENT_KB_ID: + return token.ent_kb_id else: return Lexeme.get_struct_attr(token.lex, feat_name) @@ -850,7 +852,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] # TODO: ENT_KB_ID ? if self.is_tagged: array_head.append(TAG) # If doc parsed add head and dep attribute @@ -1004,6 +1006,7 @@ cdef class Doc: """ cdef unicode tag, lemma, ent_type deprecation_warning(Warnings.W013.format(obj="Doc")) + # TODO: ENT_KB_ID ? if len(args) == 3: deprecation_warning(Warnings.W003) tag, lemma, ent_type = args diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 97b6a1adc..3f4f4418b 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -210,7 +210,7 @@ cdef class Span: words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID] if self.doc.is_tagged: array_head.append(TAG) # If doc parsed add head and dep attribute diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index bb9f7d070..ec5df3fac 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -53,6 +53,8 @@ cdef class Token: return token.ent_iob elif feat_name == ENT_TYPE: return token.ent_type + elif feat_name == ENT_KB_ID: + return token.ent_kb_id elif feat_name == SENT_START: return token.sent_start else: @@ -79,5 +81,7 @@ cdef class Token: token.ent_iob = value elif feat_name == ENT_TYPE: token.ent_type = value + elif feat_name == ENT_KB_ID: + token.ent_kb_id = value elif feat_name == SENT_START: token.sent_start = value From bee23cd8af0cfde6027e00fe506033a03c05170a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Jun 2019 16:09:22 +0200 Subject: [PATCH 093/148] try Tok2Vec instead of SpacyVectors --- examples/pipeline/wikidata_entity_linking.py | 87 ++++++++++++-------- spacy/_ml.py | 25 ++++-- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2759da135..9dc2e514f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,22 +61,23 @@ def run_pipeline(): to_create_kb = False # read KB back in from file - to_read_kb = True + to_read_kb = False to_test_kb = False # create training dataset create_wp_training = False 
# train the EL pipe - train_pipe = True - measure_performance = True + train_pipe = False + measure_performance = False # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again to_write_nlp = False - to_read_nlp = False + to_read_nlp = True + test_from_file = True # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: @@ -134,21 +135,21 @@ def run_pipeline(): training_output=TRAINING_DIR) # STEP 6: create and train the entity linking pipe - el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) - el_pipe.set_kb(kb_2) - nlp_2.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] - with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - optimizer = nlp_2.begin_training() - optimizer.learn_rate = LEARN_RATE - optimizer.L2 = L2 - if train_pipe: + el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) + + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + optimizer = nlp_2.begin_training() + optimizer.learn_rate = LEARN_RATE + optimizer.L2 = L2 + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set train_limit = 5000 - dev_limit = 5000 + dev_limit = 10000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -230,40 +231,56 @@ def run_pipeline(): el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - # STEP 8: apply the EL pipe on a toy example - if to_test_pipeline: - print() - print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) - print() - run_el_toy_example(nlp=nlp_2) + # STEP 8: apply the EL pipe on a toy example + if to_test_pipeline: + print() + print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) + print() + run_el_toy_example(nlp=nlp_2) - # STEP 9: write the NLP pipeline (including entity linker) to file - if to_write_nlp: - print() - print("STEP 9: testing NLP IO", datetime.datetime.now()) - print() - print("writing to", NLP_2_DIR) - nlp_2.to_disk(NLP_2_DIR) - print() + # STEP 9: write the NLP pipeline (including entity linker) to file + if to_write_nlp: + print() + print("STEP 9: testing NLP IO", datetime.datetime.now()) + print() + print("writing to", NLP_2_DIR) + nlp_2.to_disk(NLP_2_DIR) + print() + + # verify that the IO has gone correctly + if to_read_nlp: print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) - # verify that the IO has gone correctly - if to_read_nlp: + if test_from_file: + dev_limit = 5000 + dev_data = training_set_creator.read_training(nlp=nlp_3, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit) + + print("Dev testing from file on", len(dev_data), "articles") print() - print("running toy example with NLP 2") + + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + else: + print("running toy example with NLP 3") run_el_toy_example(nlp=nlp_3) print() print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, el_pipe): +def _measure_accuracy(data, el_pipe=None): + # If the docs in the data require further processing with an entity linker, set el_pipe correct_by_label = dict() 
incorrect_by_label = dict() docs = [d for d, g in data if len(d) > 0] - docs = el_pipe.pipe(docs) + if el_pipe is not None: + docs = el_pipe.pipe(docs) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff --git a/spacy/_ml.py b/spacy/_ml.py index 9139152aa..82db0fc05 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -655,23 +655,32 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, def build_nel_encoder(in_width, hidden_width, end_width, **cfg): conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) + pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name + + tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0) with Model.define_operators({">>": chain, "**": clone}): - convolution = Residual((ExtractWindow(nW=1) >> - LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + # convolution = Residual((ExtractWindow(nW=1) >> + # LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) - encoder = SpacyVectors \ - >> with_flatten(Affine(hidden_width, in_width))\ - >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ - >> flatten_add_lengths \ - >> ParametricAttention(hidden_width) \ - >> Pooling(sum_pool) \ + # encoder = SpacyVectors \ + # >> with_flatten(Affine(hidden_width, in_width)) \ + # >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ + # >> flatten_add_lengths \ + # >> ParametricAttention(hidden_width) \ + # >> Pooling(sum_pool) \ + # >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + # >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + + encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\ >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) # TODO: ReLu or LN(Maxout) ? # sum_pool or mean_pool ? 
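# Illustrative aside on the open question above, not part of the patch: with
# flatten_add_lengths >> Pooling(...), sum_pool adds the token vectors of a document while
# mean_pool averages them, so mean_pool keeps the output scale independent of document
# length. A plain NumPy sketch of the two reductions over made-up token vectors:
import numpy as np

token_vectors = np.array([[1.0, 0.0], [3.0, 2.0], [2.0, 4.0]])  # one row per token
summed = token_vectors.sum(axis=0)     # what sum_pool would produce: [6.0, 6.0]
averaged = token_vectors.mean(axis=0)  # what mean_pool would produce: [2.0, 2.0]
print(summed, averaged)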
+ encoder.tok2vec = tok2vec encoder.nO = end_width return encoder From 1de61f68d645f0157f3902713bb69be1cf1421e2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Jun 2019 13:53:10 +0200 Subject: [PATCH 094/148] improve speed of prediction loop --- examples/pipeline/wikidata_entity_linking.py | 39 ++++++++++++-------- spacy/pipeline/pipes.pyx | 33 ++++++++++------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 9dc2e514f..2d300f699 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -76,7 +76,7 @@ def run_pipeline(): # write the NLP object, read back in and test again to_write_nlp = False - to_read_nlp = True + to_read_nlp = False test_from_file = True # STEP 1 : create prior probabilities from WP (run only once) @@ -252,22 +252,27 @@ def run_pipeline(): print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) - if test_from_file: - dev_limit = 5000 - dev_data = training_set_creator.read_training(nlp=nlp_3, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit) + print("running toy example with NLP 3") + run_el_toy_example(nlp=nlp_3) - print("Dev testing from file on", len(dev_data), "articles") - print() + # testing performance with an NLP model from file + if test_from_file: + nlp_2 = spacy.load(NLP_1_DIR) + nlp_3 = spacy.load(NLP_2_DIR) + el_pipe = nlp_3.get_pipe("entity_linker") - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data) - print("dev acc combo avg:", round(dev_acc_combo, 3), - [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) - else: - print("running toy example with NLP 3") - run_el_toy_example(nlp=nlp_3) + dev_limit = 10000 + dev_data = training_set_creator.read_training(nlp=nlp_2, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit) + + print("Dev testing from file on", len(dev_data), "articles") + print() + + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) print() print("STOP", datetime.datetime.now()) @@ -280,7 +285,9 @@ def _measure_accuracy(data, el_pipe=None): docs = [d for d, g in data if len(d) > 0] if el_pipe is not None: - docs = el_pipe.pipe(docs) + print("applying el_pipe", datetime.datetime.now()) + docs = list(el_pipe.pipe(docs, batch_size=10000000000)) + print("done applying el_pipe", datetime.datetime.now()) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 47ba4dc05..33b3baf8d 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,8 +3,6 @@ # coding: utf8 from __future__ import unicode_literals -import numpy as np - import numpy import srsly from collections import OrderedDict @@ -12,6 +10,7 @@ from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm from thinc.neural.util import to_categorical +from thinc.neural.util import get_array_module from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens @@ -1151,7 +1150,7 @@ class EntityLinker(Pipe): if len(entity_encodings) > 0: context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) - entity_encodings = np.asarray(entity_encodings, dtype=np.float32) + entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") loss, 
d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) bp_context(d_scores, sgd=sgd) @@ -1192,24 +1191,30 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + context_encodings = self.model(docs) + xp = get_array_module(context_encodings) + for i, doc in enumerate(docs): if len(doc) > 0: - context_encoding = self.model([doc]) - context_enc_t = np.transpose(context_encoding) + context_encoding = context_encodings[i] + context_enc_t = context_encoding.T + norm_1 = xp.linalg.norm(context_enc_t) for ent in doc.ents: candidates = self.kb.get_candidates(ent.text) if candidates: - scores = [] - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) - scores.append(score) + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + prior_probs *= self.prior_weight + + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + norm_2 = xp.linalg.norm(entity_encodings, axis=1) + + # cosine similarity + sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2) + sims *= self.context_weight + scores = prior_probs + sims - (prior_probs*sims) + best_index = scores.argmax() # TODO: thresholding - best_index = scores.index(max(scores)) best_candidate = candidates[best_index] final_entities.append(ent) final_kb_ids.append(best_candidate.entity_) From dbc53b9870a76840d50c29cd1708e02c02414756 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Jun 2019 15:55:26 +0200 Subject: [PATCH 095/148] rename to KBEntryC --- examples/pipeline/wikidata_entity_linking.py | 18 ++++++++---------- spacy/kb.pxd | 10 +++++----- spacy/kb.pyx | 4 ++-- spacy/structs.pxd | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2d300f699..9ce3b9559 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,23 +61,23 @@ def run_pipeline(): to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False # create training dataset create_wp_training = False # train the EL pipe - train_pipe = False - measure_performance = False + train_pipe = True + measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = False + to_write_nlp = True to_read_nlp = False - test_from_file = True + test_from_file = False # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: @@ -149,7 +149,7 @@ def run_pipeline(): print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set train_limit = 5000 - dev_limit = 10000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -285,9 +285,7 @@ def _measure_accuracy(data, el_pipe=None): docs = [d for d, g in data if len(d) > 0] if el_pipe is not None: - print("applying el_pipe", datetime.datetime.now()) - docs = list(el_pipe.pipe(docs, batch_size=10000000000)) - print("done applying el_pipe", datetime.datetime.now()) + docs = list(el_pipe.pipe(docs)) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff --git 
a/spacy/kb.pxd b/spacy/kb.pxd index ccf150cd2..40b22b275 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -9,8 +9,8 @@ from libc.stdio cimport FILE from spacy.vocab cimport Vocab from .typedefs cimport hash_t -from .structs cimport EntryC, AliasC -ctypedef vector[EntryC] entry_vec +from .structs cimport KBEntryC, AliasC +ctypedef vector[KBEntryC] entry_vec ctypedef vector[AliasC] alias_vec ctypedef vector[float] float_vec ctypedef vector[float_vec] float_matrix @@ -32,7 +32,7 @@ cdef class KnowledgeBase: cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) - # to 64bit values (position of the _EntryC struct in the _entries vector). + # to 64bit values (position of the _KBEntryC struct in the _entries vector). # The PreshMap is pretty space efficient, as it uses open addressing. So # the only overhead is the vacancy rate, which is approximately 30%. cdef PreshMap _entry_index @@ -88,7 +88,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._entries.size() # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 - cdef EntryC entry + cdef KBEntryC entry entry.entity_hash = entity_hash entry.vector_index = vector_index entry.feats_row = feats_row @@ -121,7 +121,7 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 0 # Avoid struct initializer to enable nogil - cdef EntryC entry + cdef KBEntryC entry entry.entity_hash = dummy_hash entry.vector_index = dummy_value entry.feats_row = dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4d9d2b89b..7c2daa659 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -129,7 +129,7 @@ cdef class KnowledgeBase: self._entries = entry_vec(nr_entities+1) i = 0 - cdef EntryC entry + cdef KBEntryC entry while i < nr_entities: entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: @@ -250,7 +250,7 @@ cdef class KnowledgeBase: cdef int64_t entry_index cdef float prob cdef int32_t vector_index - cdef EntryC entry + cdef KBEntryC entry cdef AliasC alias cdef float vector_element diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 8de4d5f4c..e80b1b4d6 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -79,7 +79,7 @@ cdef struct TokenC: # Internal struct, for storage and disambiguation of entities. 
-cdef struct EntryC: +cdef struct KBEntryC: # The hash of this entry's unique ID/name in the kB hash_t entity_hash From 68a0662019760a20bbc740be43b2ec58aa5a816e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 28 Jun 2019 08:29:31 +0200 Subject: [PATCH 096/148] context encoder with Tok2Vec + linking model instead of cosine --- bin/wiki_entity_linking/kb_creator.py | 4 +- bin/wiki_entity_linking/train_descriptions.py | 4 +- .../training_set_creator.py | 3 +- examples/pipeline/wikidata_entity_linking.py | 9 +-- spacy/_ml.py | 45 +++++++------ spacy/pipeline/pipes.pyx | 66 ++++++++++++------- 6 files changed, 73 insertions(+), 58 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 6ee139174..e8e081cef 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -33,7 +33,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, else: # read the mappings from file title_to_id = get_entity_to_id(entity_def_output) - id_to_descr = _get_id_to_description(entity_descr_output) + id_to_descr = get_id_to_description(entity_descr_output) print() print(" * _get_entity_frequencies", datetime.datetime.now()) @@ -109,7 +109,7 @@ def get_entity_to_id(entity_def_output): return entity_to_id -def _get_id_to_description(entity_descr_output): +def get_id_to_description(entity_descr_output): id_to_desc = dict() with open(entity_descr_output, 'r', encoding='utf8') as csvfile: csvreader = csv.reader(csvfile, delimiter='|') diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index 948a0e2d1..6a4d046e5 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -14,7 +14,7 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: """ Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). - This entity vector will be stored in the KB, and context vectors will be trained to be similar to them. + This entity vector will be stored in the KB, for further downstream use in the entity model. """ DROP = 0 @@ -97,7 +97,7 @@ class EntityEncoder: else: indices[i] = 0 word_vectors = doc.vocab.vectors.data[indices] - doc_vector = np.mean(word_vectors, axis=0) # TODO: min? max? + doc_vector = np.mean(word_vectors, axis=0) return doc_vector def _build_network(self, orig_width, hidden_with): diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 51105ce09..436154409 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -14,8 +14,7 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm Gold-standard entities are stored in one file in standoff format (by character offset). 
""" -# ENTITY_FILE = "gold_entities.csv" -ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing +ENTITY_FILE = "gold_entities.csv" def create_training(wikipedia_input, entity_def_input, training_output): diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 9ce3b9559..600436a1d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -42,9 +42,10 @@ MIN_PAIR_OCC = 5 # model training parameters EPOCHS = 10 -DROPOUT = 0.1 +DROPOUT = 0.2 LEARN_RATE = 0.005 L2 = 1e-6 +CONTEXT_WIDTH=128 def run_pipeline(): @@ -136,7 +137,8 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: - el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -146,9 +148,8 @@ def run_pipeline(): optimizer.learn_rate = LEARN_RATE optimizer.L2 = L2 - print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set - train_limit = 5000 + train_limit = 500000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, diff --git a/spacy/_ml.py b/spacy/_ml.py index 82db0fc05..b00ceda62 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,37 +652,36 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model -def build_nel_encoder(in_width, hidden_width, end_width, **cfg): +def build_nel_encoder(embed_width, hidden_width, **cfg): + # TODO proper error + if "entity_width" not in cfg: + raise ValueError("entity_width not found") + if "context_width" not in cfg: + raise ValueError("context_width not found") + conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name - - tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0) + context_width = cfg.get("context_width") + entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - # convolution = Residual((ExtractWindow(nW=1) >> - # LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + model = Affine(1, entity_width+context_width+1, drop_factor=0.0)\ + >> logistic - # encoder = SpacyVectors \ - # >> with_flatten(Affine(hidden_width, in_width)) \ - # >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ - # >> flatten_add_lengths \ - # >> ParametricAttention(hidden_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - # >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + # context encoder + tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, + bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) - encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\ - >> 
Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + model.tok2vec = tok2vec - # TODO: ReLu or LN(Maxout) ? - # sum_pool or mean_pool ? - - encoder.tok2vec = tok2vec - encoder.nO = end_width - return encoder + model.tok2vec = tok2vec + model.tok2vec.nO = context_width + model.nO = 1 + return model @layerize def flatten(seqs, drop=0.0): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 33b3baf8d..25df31f70 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -5,6 +5,7 @@ from __future__ import unicode_literals import numpy import srsly +import random from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax @@ -229,7 +230,7 @@ class Tensorizer(Pipe): vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` instance with the `Doc` objects it will process. - model (Model): A `Model` instance or `True` allocate one later. + model (Model): A `Model` instance or `True` to allocate one later. **cfg: Config parameters. EXAMPLE: @@ -386,7 +387,7 @@ class Tagger(Pipe): def predict(self, docs): self.require_model() if not any(len(doc) for doc in docs): - # Handle case where there are no tokens in any docs. + # Handle cases where there are no tokens in any docs. n_labels = len(self.labels) guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) @@ -1071,22 +1072,20 @@ class EntityLinker(Pipe): @classmethod def Model(cls, **cfg): - if "entity_width" not in cfg: - raise ValueError("entity_width not found") - embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) - entity_width = cfg.get("entity_width") # this needs to correspond with the KB entity length - model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) + model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) return model def __init__(self, **cfg): self.model = True self.kb = None + self.sgd_context = None self.cfg = dict(cfg) self.context_weight = cfg.get("context_weight", 1) self.prior_weight = cfg.get("prior_weight", 1) + self.context_width = cfg.get("context_width") def set_kb(self, kb): self.kb = kb @@ -1107,6 +1106,7 @@ class EntityLinker(Pipe): if self.model is True: self.model = self.Model(**self.cfg) + self.sgd_context = self.create_optimizer() if sgd is None: sgd = self.create_optimizer() @@ -1132,35 +1132,55 @@ class EntityLinker(Pipe): context_docs = [] entity_encodings = [] + labels = [] for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] - candidates = self.kb.get_candidates(mention) + random.shuffle(candidates) + nr_neg = 0 for c in candidates: kb_id = c.entity_ - # Currently only training on the positive instances if kb_id == gold_kb: - prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) context_docs.append(doc) + labels.append([1]) + else: # elif nr_neg < 1: + nr_neg += 1 + entity_encoding = c.entity_vector + entity_encodings.append(entity_encoding) + context_docs.append(doc) + labels.append([0]) if len(entity_encodings) > 0: - context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) + context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, 
dtype="float32") - loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) - bp_context(d_scores, sgd=sgd) + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))] + pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) + labels = self.model.ops.asarray(labels, dtype="float32") + + loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None) + mention_gradient = bp_mention(d_scores, sgd=sgd) + + context_gradients = [list(x[0:self.context_width]) for x in mention_gradient] + bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context) if losses is not None: losses[self.name] += loss return loss return 0 - def get_loss(self, docs, golds, scores): + def get_loss(self, docs, golds, prediction): + d_scores = (prediction - golds) + loss = (d_scores ** 2).sum() + loss = loss / len(golds) + return loss, d_scores + + def get_loss_old(self, docs, golds, scores): # this loss function assumes we're only using positive examples loss, gradients = get_cossim_loss(yh=scores, y=golds) loss = loss / len(golds) @@ -1191,30 +1211,26 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] - context_encodings = self.model(docs) + context_encodings = self.model.tok2vec(docs) xp = get_array_module(context_encodings) for i, doc in enumerate(docs): if len(doc) > 0: context_encoding = context_encodings[i] - context_enc_t = context_encoding.T - norm_1 = xp.linalg.norm(context_enc_t) for ent in doc.ents: candidates = self.kb.get_candidates(ent.text) if candidates: - prior_probs = xp.asarray([c.prior_prob for c in candidates]) + random.shuffle(candidates) + prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) prior_probs *= self.prior_weight entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - norm_2 = xp.linalg.norm(entity_encodings, axis=1) - - # cosine similarity - sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2) - sims *= self.context_weight - scores = prior_probs + sims - (prior_probs*sims) - best_index = scores.argmax() + mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))] + predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) + scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions))) # TODO: thresholding + best_index = scores.argmax() best_candidate = candidates[best_index] final_entities.append(ent) final_kb_ids.append(best_candidate.entity_) From 1c80b852414f61f832ba29a3a7aac7d63c55218b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 28 Jun 2019 08:59:23 +0200 Subject: [PATCH 097/148] fix tests --- examples/pipeline/dummy_entity_linking.py | 6 +++++- spacy/_ml.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 3f1fabdfd..0e59db304 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -41,8 +41,12 @@ def create_kb(vocab): def add_el(kb, nlp): - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64}) + el_pipe.set_kb(kb) nlp.add_pipe(el_pipe, last=True) + nlp.begin_training() + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 for alias in 
["Douglas Adams", "Douglas"]: candidates = nlp.linker.kb.get_candidates(alias) diff --git a/spacy/_ml.py b/spacy/_ml.py index b00ceda62..5a5bfa07e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(1, entity_width+context_width+1, drop_factor=0.0)\ + model = Affine(1, entity_width+context_width, drop_factor=0.0)\ >> logistic # context encoder diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 7ea893408..cafc380ba 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -126,7 +126,7 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) nlp.add_pipe(ruler) - el_pipe = nlp.create_pipe(name='entity_linker', config={}) + el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64}) el_pipe.set_kb(mykb) el_pipe.begin_training() el_pipe.context_weight = 0 From c664f58246b5ec2a8233f28f2006dacb60681200 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 28 Jun 2019 16:22:58 +0200 Subject: [PATCH 098/148] adding prior probability as feature in the model --- examples/pipeline/wikidata_entity_linking.py | 10 +++-- spacy/_ml.py | 7 +-- spacy/pipeline/pipes.pyx | 47 +++++++++++++------- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 600436a1d..a61af3660 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -45,7 +45,7 @@ EPOCHS = 10 DROPOUT = 0.2 LEARN_RATE = 0.005 L2 = 1e-6 -CONTEXT_WIDTH=128 +CONTEXT_WIDTH = 128 def run_pipeline(): @@ -138,7 +138,9 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH}) + el_pipe = nlp_2.create_pipe(name='entity_linker', + config={"context_width": CONTEXT_WIDTH, + "pretrained_vectors": nlp_2.vocab.vectors.name}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -195,11 +197,11 @@ def run_pipeline(): if batchnr > 0: with el_pipe.model.use_params(optimizer.averages): el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 + el_pipe.prior_weight = 1 dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2), - " / dev acc context avg", round(dev_acc_context, 3)) + " / dev acc avg", round(dev_acc_context, 3)) # STEP 7: measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: diff --git a/spacy/_ml.py b/spacy/_ml.py index 5a5bfa07e..07037f653 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -666,15 +666,16 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(1, entity_width+context_width, drop_factor=0.0)\ + model = Affine(entity_width, entity_width+context_width+1)\ + >> Affine(1, entity_width, drop_factor=0.0)\ >> logistic # context encoder tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, - 
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth, bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) + >> zero_init(Affine(context_width, hidden_width)) model.tok2vec = tok2vec diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 25df31f70..d3f6fa776 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1132,7 +1132,8 @@ class EntityLinker(Pipe): context_docs = [] entity_encodings = [] - labels = [] + cats = [] + priors = [] for doc, gold in zip(docs, golds): for entity in gold.links: @@ -1143,27 +1144,33 @@ class EntityLinker(Pipe): nr_neg = 0 for c in candidates: kb_id = c.entity_ + entity_encoding = c.entity_vector + entity_encodings.append(entity_encoding) + context_docs.append(doc) + + if self.prior_weight > 0: + priors.append([c.prior_prob]) + else: + priors.append([0]) + if kb_id == gold_kb: - entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - context_docs.append(doc) - labels.append([1]) - else: # elif nr_neg < 1: + cats.append([1]) + else: nr_neg += 1 - entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - context_docs.append(doc) - labels.append([0]) + cats.append([0]) if len(entity_encodings) > 0: + assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) + context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))] + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + for i in range(len(entity_encodings))] pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) - labels = self.model.ops.asarray(labels, dtype="float32") + cats = self.model.ops.asarray(cats, dtype="float32") - loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None) + loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None) mention_gradient = bp_mention(d_scores, sgd=sgd) context_gradients = [list(x[0:self.context_width]) for x in mention_gradient] @@ -1221,13 +1228,19 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if candidates: random.shuffle(candidates) + + # this will set the prior probabilities to 0 (just like in training) if their weight is 0 prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) prior_probs *= self.prior_weight + scores = prior_probs - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))] - predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) - scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions))) + if self.context_weight > 0: + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + assert len(entity_encodings) == len(prior_probs) + mention_encodings = [list(context_encoding) + list(entity_encodings[i]) + + list(prior_probs[i]) + for i in range(len(entity_encodings))] + scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) # 
TODO: thresholding best_index = scores.argmax() From 2d2dea99244b520cd62813fb2de62cf78b5f09be Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 29 Jun 2019 14:52:36 +0200 Subject: [PATCH 099/148] experiment with adding NER types to the feature vector --- examples/pipeline/wikidata_entity_linking.py | 27 ++++++++++++-------- spacy/_ml.py | 4 +-- spacy/pipeline/pipes.pyx | 26 ++++++++++++++++--- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a61af3660..c0a7e3c66 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -42,7 +42,7 @@ MIN_PAIR_OCC = 5 # model training parameters EPOCHS = 10 -DROPOUT = 0.2 +DROPOUT = 0.5 LEARN_RATE = 0.005 L2 = 1e-6 CONTEXT_WIDTH = 128 @@ -73,10 +73,10 @@ def run_pipeline(): measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again - to_write_nlp = True + to_write_nlp = False to_read_nlp = False test_from_file = False @@ -138,9 +138,12 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)} + print(" -analysing", len(type_to_int), "different entity types") el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH, - "pretrained_vectors": nlp_2.vocab.vectors.name}) + "pretrained_vectors": nlp_2.vocab.vectors.name, + "type_to_int": type_to_int}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -151,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 500000 - dev_limit = 5000 + train_limit = 50000 + dev_limit = 50000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -219,7 +222,7 @@ def run_pipeline(): # measuring combined accuracy (prior + context) el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) print("dev acc combo avg:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) @@ -264,7 +267,7 @@ def run_pipeline(): nlp_3 = spacy.load(NLP_2_DIR) el_pipe = nlp_3.get_pipe("entity_linker") - dev_limit = 10000 + dev_limit = 5000 dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, @@ -273,7 +276,7 @@ def run_pipeline(): print("Dev testing from file on", len(dev_data), "articles") print() - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False) print("dev acc combo avg:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) @@ -281,7 +284,7 @@ def run_pipeline(): print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, el_pipe=None): +def _measure_accuracy(data, el_pipe=None, error_analysis=False): # If the docs in the data require further processing with an entity linker, set el_pipe correct_by_label = dict() incorrect_by_label = dict() @@ -312,6 +315,10 @@ def _measure_accuracy(data, el_pipe=None): else: incorrect = incorrect_by_label.get(ent_label, 0) 
incorrect_by_label[ent_label] = incorrect + 1 + if error_analysis: + print(ent.text, "in", doc) + print("Predicted", pred_entity, "should have been", gold_entity) + print() except Exception as e: print("Error assessing accuracy", e) diff --git a/spacy/_ml.py b/spacy/_ml.py index 07037f653..cca324b45 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,7 +652,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model -def build_nel_encoder(embed_width, hidden_width, **cfg): +def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): # TODO proper error if "entity_width" not in cfg: raise ValueError("entity_width not found") @@ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(entity_width, entity_width+context_width+1)\ + model = Affine(entity_width, entity_width+context_width+1+ner_types)\ >> Affine(1, entity_width, drop_factor=0.0)\ >> logistic diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d3f6fa776..f1a864fcf 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1074,8 +1074,9 @@ class EntityLinker(Pipe): def Model(cls, **cfg): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) + type_to_int = cfg.get("type_to_int", dict()) - model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) + model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) return model def __init__(self, **cfg): @@ -1086,6 +1087,7 @@ class EntityLinker(Pipe): self.context_weight = cfg.get("context_weight", 1) self.prior_weight = cfg.get("prior_weight", 1) self.context_width = cfg.get("context_width") + self.type_to_int = cfg.get("type_to_int", dict()) def set_kb(self, kb): self.kb = kb @@ -1134,11 +1136,22 @@ class EntityLinker(Pipe): entity_encodings = [] cats = [] priors = [] + type_vectors = [] for doc, gold in zip(docs, golds): + ents_by_offset = dict() + for ent in doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] + + gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] + assert gold_ent is not None + type_vector = [0 for i in range(len(self.type_to_int))] + if len(self.type_to_int) > 0: + type_vector[self.type_to_int[gold_ent.label_]] = 1 + candidates = self.kb.get_candidates(mention) random.shuffle(candidates) nr_neg = 0 @@ -1147,6 +1160,7 @@ class EntityLinker(Pipe): entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) context_docs.append(doc) + type_vectors.append(type_vector) if self.prior_weight > 0: priors.append([c.prior_prob]) @@ -1160,12 +1174,12 @@ class EntityLinker(Pipe): cats.append([0]) if len(entity_encodings) > 0: - assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) + assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i] for i in range(len(entity_encodings))] pred, bp_mention 
= self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) cats = self.model.ops.asarray(cats, dtype="float32") @@ -1225,6 +1239,10 @@ class EntityLinker(Pipe): if len(doc) > 0: context_encoding = context_encodings[i] for ent in doc.ents: + type_vector = [0 for i in range(len(self.type_to_int))] + if len(self.type_to_int) > 0: + type_vector[self.type_to_int[ent.label_]] = 1 + candidates = self.kb.get_candidates(ent.text) if candidates: random.shuffle(candidates) @@ -1238,7 +1256,7 @@ class EntityLinker(Pipe): entity_encodings = xp.asarray([c.entity_vector for c in candidates]) assert len(entity_encodings) == len(prior_probs) mention_encodings = [list(context_encoding) + list(entity_encodings[i]) - + list(prior_probs[i]) + + list(prior_probs[i]) + type_vector for i in range(len(entity_encodings))] scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) From 3420cbe49639ab77e36612d7c7ab5abeffe9cd46 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 10:25:51 +0200 Subject: [PATCH 100/148] small fixes --- bin/wiki_entity_linking/training_set_creator.py | 5 ++--- examples/pipeline/wikidata_entity_linking.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 436154409..5d401bb3f 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -7,7 +7,7 @@ import bz2 import datetime from spacy.gold import GoldParse -from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp +from bin.wiki_entity_linking import kb_creator """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm. 
@@ -342,8 +342,7 @@ def read_training(nlp, training_dir, dev, limit): # currently feeding the gold data one entity per sentence at a time gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - gold_entities = [] - gold_entities.append((gold_start, gold_end, wp_title)) + gold_entities = [(gold_start, gold_end, wp_title)] gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) total_entities += 1 diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index c0a7e3c66..d914f033c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -394,10 +394,10 @@ def _measure_baselines(data, kb): print("Error assessing accuracy", e) acc_prior, acc_prior_by_label = calculate_acc(prior_correct_by_label, prior_incorrect_by_label) - acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) + acc_rand, acc_rand_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) - return counts_by_label, acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + return counts_by_label, acc_rand, acc_rand_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label def calculate_acc(correct_by_label, incorrect_by_label): From 8840d4b1b3ac9aa9e774b576ea405a205b353f64 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 13:35:36 +0200 Subject: [PATCH 101/148] fix for context encoder optimizer --- examples/pipeline/wikidata_entity_linking.py | 13 ++++---- spacy/pipeline/pipes.pyx | 31 ++++++++++---------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index d914f033c..b57d9f541 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -73,11 +73,11 @@ def run_pipeline(): measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = False - to_read_nlp = False + to_write_nlp = True + to_read_nlp = True test_from_file = False # STEP 1 : create prior probabilities from WP (run only once) @@ -154,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 50000 - dev_limit = 50000 + train_limit = 5 + dev_limit = 5 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -250,7 +250,8 @@ def run_pipeline(): print("STEP 9: testing NLP IO", datetime.datetime.now()) print() print("writing to", NLP_2_DIR) - nlp_2.to_disk(NLP_2_DIR) + with el_pipe.model.use_params(optimizer.averages) and el_pipe.model.tok2vec.use_params(el_pipe.sgd_context.averages): + nlp_2.to_disk(NLP_2_DIR) print() # verify that the IO has gone correctly diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f1a864fcf..91f5e7044 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1082,12 +1082,8 @@ class EntityLinker(Pipe): def __init__(self, **cfg): self.model = True self.kb = None - self.sgd_context = None self.cfg = dict(cfg) - self.context_weight = cfg.get("context_weight", 1) - self.prior_weight = cfg.get("prior_weight", 1) - self.context_width = 
cfg.get("context_width") - self.type_to_int = cfg.get("type_to_int", dict()) + self.sgd_context = None def set_kb(self, kb): self.kb = kb @@ -1112,6 +1108,7 @@ class EntityLinker(Pipe): if sgd is None: sgd = self.create_optimizer() + return sgd def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): @@ -1138,6 +1135,8 @@ class EntityLinker(Pipe): priors = [] type_vectors = [] + type_to_int = self.cfg.get("type_to_int", dict()) + for doc, gold in zip(docs, golds): ents_by_offset = dict() for ent in doc.ents: @@ -1148,9 +1147,9 @@ class EntityLinker(Pipe): gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] assert gold_ent is not None - type_vector = [0 for i in range(len(self.type_to_int))] - if len(self.type_to_int) > 0: - type_vector[self.type_to_int[gold_ent.label_]] = 1 + type_vector = [0 for i in range(len(type_to_int))] + if len(type_to_int) > 0: + type_vector[type_to_int[gold_ent.label_]] = 1 candidates = self.kb.get_candidates(mention) random.shuffle(candidates) @@ -1162,7 +1161,7 @@ class EntityLinker(Pipe): context_docs.append(doc) type_vectors.append(type_vector) - if self.prior_weight > 0: + if self.cfg.get("prior_weight", 1) > 0: priors.append([c.prior_prob]) else: priors.append([0]) @@ -1187,7 +1186,7 @@ class EntityLinker(Pipe): loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None) mention_gradient = bp_mention(d_scores, sgd=sgd) - context_gradients = [list(x[0:self.context_width]) for x in mention_gradient] + context_gradients = [list(x[0:self.cfg.get("context_width")]) for x in mention_gradient] bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context) if losses is not None: @@ -1235,13 +1234,15 @@ class EntityLinker(Pipe): context_encodings = self.model.tok2vec(docs) xp = get_array_module(context_encodings) + type_to_int = self.cfg.get("type_to_int", dict()) + for i, doc in enumerate(docs): if len(doc) > 0: context_encoding = context_encodings[i] for ent in doc.ents: - type_vector = [0 for i in range(len(self.type_to_int))] - if len(self.type_to_int) > 0: - type_vector[self.type_to_int[ent.label_]] = 1 + type_vector = [0 for i in range(len(type_to_int))] + if len(type_to_int) > 0: + type_vector[type_to_int[ent.label_]] = 1 candidates = self.kb.get_candidates(ent.text) if candidates: @@ -1249,10 +1250,10 @@ class EntityLinker(Pipe): # this will set the prior probabilities to 0 (just like in training) if their weight is 0 prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) - prior_probs *= self.prior_weight + prior_probs *= self.cfg.get("prior_weight", 1) scores = prior_probs - if self.context_weight > 0: + if self.cfg.get("context_weight", 1) > 0: entity_encodings = xp.asarray([c.entity_vector for c in candidates]) assert len(entity_encodings) == len(prior_probs) mention_encodings = [list(context_encoding) + list(entity_encodings[i]) From 668b17ea4a7f5133f68b16586e6f4a1f45279bee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 15:00:42 +0200 Subject: [PATCH 102/148] deuglify kb deserializer --- spacy/language.py | 9 +-------- spacy/pipeline/pipes.pyx | 12 +++++++++++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 2225a763e..570630eb3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -118,7 +118,7 @@ class Language(object): "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), "ner": lambda nlp, **cfg: 
EntityRecognizer(nlp.vocab, **cfg), - "entity_linker": lambda nlp, **cfg: EntityLinker(**cfg), + "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), @@ -811,13 +811,6 @@ class Language(object): exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) - # download the KB for the entity linking component - requires the vocab - for pipe_name, pipe in self.pipeline: - if pipe_name == "entity_linker": - kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=pipe.cfg["entity_width"]) - kb.load_bulk(path / pipe_name / "kb") - pipe.set_kb(kb) - self._path = path return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 91f5e7044..f4dc08251 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,6 +13,7 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module +from spacy.kb import KnowledgeBase from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens from ..tokens.doc cimport Doc @@ -1079,7 +1080,8 @@ class EntityLinker(Pipe): model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) return model - def __init__(self, **cfg): + def __init__(self, vocab, **cfg): + self.vocab = vocab self.model = True self.kb = None self.cfg = dict(cfg) @@ -1277,6 +1279,7 @@ class EntityLinker(Pipe): def to_disk(self, path, exclude=tuple(), **kwargs): serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) if self.model not in (None, True, False): serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) @@ -1289,8 +1292,15 @@ class EntityLinker(Pipe): self.model = self.Model(**self.cfg) self.model.from_bytes(p.open("rb").read()) + def load_kb(p): + kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) + kb.load_bulk(p) + self.set_kb(kb) + deserialize = OrderedDict() deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["kb"] = load_kb deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) From 0ea52c86b89e65b5bdca23dd331985033a9f0c2d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 15:02:10 +0200 Subject: [PATCH 103/148] remove redundancy --- spacy/language.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 570630eb3..39d95c689 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,7 +11,6 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly -from .kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer @@ -810,7 +809,6 @@ class Language(object): # Convert to list here in case exclude is (default) tuple exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) - self._path = path return self From b7a0c9bf60757acdf0586b35ec755ccd8fab5099 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 17:48:09 +0200 Subject: [PATCH 104/148] fixing the context/prior weight settings --- 
examples/pipeline/wikidata_entity_linking.py | 49 +++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b57d9f541..17c2976dd 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -154,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 5 - dev_limit = 5 + train_limit = 5000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -198,13 +198,12 @@ def run_pipeline(): print("Error updating batch:", e) if batchnr > 0: - with el_pipe.model.use_params(optimizer.averages): - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2), - " / dev acc avg", round(dev_acc_context, 3)) + el_pipe.cfg["context_weight"] = 1 + el_pipe.cfg["prior_weight"] = 1 + dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2), + " / dev acc avg", round(dev_acc_context, 3)) # STEP 7: measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: @@ -218,24 +217,19 @@ def run_pipeline(): print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) - with el_pipe.model.use_params(optimizer.averages): - # measuring combined accuracy (prior + context) - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) - print("dev acc combo avg:", round(dev_acc_combo, 3), - [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + # using only context + el_pipe.cfg["context_weight"] = 1 + el_pipe.cfg["prior_weight"] = 0 + dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context avg:", round(dev_acc_context, 3), + [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context avg:", round(dev_acc_context, 3), - [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) - - # reset for follow-up tests - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 + # measuring combined accuracy (prior + context) + el_pipe.cfg["context_weight"] = 1 + el_pipe.cfg["prior_weight"] = 1 + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) # STEP 8: apply the EL pipe on a toy example if to_test_pipeline: @@ -250,8 +244,7 @@ def run_pipeline(): print("STEP 9: testing NLP IO", datetime.datetime.now()) print() print("writing to", NLP_2_DIR) - with el_pipe.model.use_params(optimizer.averages) and el_pipe.model.tok2vec.use_params(el_pipe.sgd_context.averages): - nlp_2.to_disk(NLP_2_DIR) + nlp_2.to_disk(NLP_2_DIR) print() # verify that the IO has 
gone correctly From 6d577f0b92f6c3d0333b6816ed9b443a39817e3a Mon Sep 17 00:00:00 2001 From: Alejandro Alcalde Date: Tue, 9 Jul 2019 20:54:59 +0200 Subject: [PATCH 105/148] Evaluation of NER model per entity type, closes #3490 (#3911) * Evaluation of NER model per entity type, closes ##3490 Now each ent score is tracked individually in order to have its own Precision, Recall and F1 Score * Keep track of each entity individually using dicts * Improving how to compute the scores for each entity * Fixed bug computing scores for ents * Formatting with black * Added key ents_per_type to the scores function The key `ents_per_type` contains the metrics Precision, Recall and F1-Score for each entity individually --- spacy/scorer.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/spacy/scorer.py b/spacy/scorer.py index 32716b852..c01353520 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -52,6 +52,7 @@ class Scorer(object): self.labelled = PRFScore() self.tags = PRFScore() self.ner = PRFScore() + self.ner_per_ents = dict() self.eval_punct = eval_punct @property @@ -104,6 +105,15 @@ class Scorer(object): "ents_f": self.ents_f, "tags_acc": self.tags_acc, "token_acc": self.token_acc, + "ents_per_type": self.__scores_per_ents(), + } + + def __scores_per_ents(self): + """RETURNS (dict): Scores per NER entity + """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.ner_per_ents.items() } def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): @@ -149,13 +159,31 @@ class Scorer(object): cand_deps.add((gold_i, gold_head, token.dep_.lower())) if "-" not in [token[-1] for token in gold.orig_annot]: cand_ents = set() + current_ent = {k.label_: set() for k in doc.ents} + current_gold = {k.label_: set() for k in doc.ents} for ent in doc.ents: + if ent.label_ not in self.ner_per_ents: + self.ner_per_ents[ent.label_] = PRFScore() first = gold.cand_to_gold[ent.start] last = gold.cand_to_gold[ent.end - 1] if first is None or last is None: self.ner.fp += 1 + self.ner_per_ents[ent.label_].fp += 1 else: cand_ents.add((ent.label_, first, last)) + current_ent[ent.label_].add( + tuple(x for x in cand_ents if x[0] == ent.label_) + ) + current_gold[ent.label_].add( + tuple(x for x in gold_ents if x[0] == ent.label_) + ) + # Scores per ent + [ + v.score_set(current_ent[k], current_gold[k]) + for k, v in self.ner_per_ents.items() + if k in current_ent + ] + # Score for all ents self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) self.labelled.score_set(cand_deps, gold_deps) From 04982ccc4033ec15864bba659430a8408ca94774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20B=C3=B6ing?= <33514570+BreakBB@users.noreply.github.com> Date: Tue, 9 Jul 2019 21:48:30 +0200 Subject: [PATCH 106/148] =?UTF-8?q?Update=20pretrain=20to=20prevent=20unin?= =?UTF-8?q?tended=20overwriting=20of=20weight=20fil=E2=80=A6=20(#3902)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update pretrain to prevent unintended overwriting of weight files for #3859 * Add '--epoch-start' to pretrain docs * Add mising pretrain arguments to bash example * Update doc tag for v2.1.5 --- spacy/cli/pretrain.py | 33 +++++++++++++++++++++++++++++++-- website/docs/api/cli.md | 9 +++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 2fe5b247a..678f12be1 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,6 
+5,7 @@ import plac import random import numpy import time +import re from collections import Counter from pathlib import Path from thinc.v2v import Affine, Maxout @@ -65,6 +66,13 @@ from .train import _load_pretrained_tok2vec "t2v", Path, ), + epoch_start=( + "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been " + "renamed. Prevents unintended overwriting of existing weight files.", + "option", + "es", + int + ), ) def pretrain( texts_loc, @@ -83,6 +91,7 @@ def pretrain( seed=0, n_save_every=None, init_tok2vec=None, + epoch_start=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -151,9 +160,29 @@ def pretrain( if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + else: + if not epoch_start: + msg.fail( + "You have to use the '--epoch-start' argument when using a renamed weight file for " + "'--init-tok2vec'", exits=True + ) + elif epoch_start < 0: + msg.fail( + "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" % epoch_start, + exits=True + ) + else: + # Without '--init-tok2vec' the '--epoch-start' argument is ignored + epoch_start = 0 + optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) - msg.divider("Pre-training tok2vec layer") + msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start) row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -174,7 +203,7 @@ def pretrain( file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 - for epoch in range(n_iter): + for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size) ): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index a69e62219..7af134e40 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -284,9 +284,9 @@ same between pretraining and training. The API and errors around this need some improvement. ```bash -$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] -[--depth] [--embed-rows] [--loss_func] [--dropout] [--seed] [--n-iter] [--use-vectors] -[--n-save_every] +$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] +[--width] [--depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length] +[--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start] ``` | Argument | Type | Description | @@ -306,7 +306,8 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] | `--n-iter`, `-i` | option | Number of iterations to pretrain. | | `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | | `--n-save-every`, `-se` | option | Save model every X batches. | -| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. 
Experimental.| +| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.| +| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.| | **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | ### JSONL format for raw text {#pretrain-jsonl} From 547464609d8da5230bf2bcbb020b2abfde5dd216 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 9 Jul 2019 21:50:30 +0200 Subject: [PATCH 107/148] Remove merge_subtokens from parser postprocessing for now --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1d4eeadce..1f4dd4253 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1001,7 +1001,7 @@ cdef class DependencyParser(Parser): @property def postprocesses(self): - return [nonproj.deprojectivize, merge_subtokens] + return [nonproj.deprojectivize] def add_multitask_objective(self, target): if target == "cloze": From 58f06e61800a477f67d13911068fd24892ccfa15 Mon Sep 17 00:00:00 2001 From: cedar101 Date: Wed, 10 Jul 2019 05:23:16 +0900 Subject: [PATCH 108/148] Korean support (#3901) * start lang/ko * add test codes * using natto-py * add test_ko_tokenizer_full_tags() * spaCy contributor agreement * external dependency for ko * collections.namedtuple for python version < 3.5 * case fix * tuple unpacking * add jongseong(final consonant) * apply mecab option * Remove Pipfile for now Co-authored-by: Ines Montani --- .github/contributors/cedar101.md | 106 +++++++++++++++++++ .gitignore | 2 + setup.py | 1 + spacy/lang/ko/__init__.py | 118 ++++++++++++++++++++++ spacy/lang/ko/examples.py | 15 +++ spacy/lang/ko/stop_words.py | 68 +++++++++++++ spacy/lang/ko/tag_map.py | 66 ++++++++++++ spacy/tests/conftest.py | 6 ++ spacy/tests/lang/ko/__init__.py | 0 spacy/tests/lang/ko/test_lemmatization.py | 13 +++ spacy/tests/lang/ko/test_tokenizer.py | 46 +++++++++ website/meta/languages.json | 11 ++ 12 files changed, 452 insertions(+) create mode 100644 .github/contributors/cedar101.md create mode 100644 spacy/lang/ko/__init__.py create mode 100644 spacy/lang/ko/examples.py create mode 100644 spacy/lang/ko/stop_words.py create mode 100644 spacy/lang/ko/tag_map.py create mode 100644 spacy/tests/lang/ko/__init__.py create mode 100644 spacy/tests/lang/ko/test_lemmatization.py create mode 100644 spacy/tests/lang/ko/test_tokenizer.py diff --git a/.github/contributors/cedar101.md b/.github/contributors/cedar101.md new file mode 100644 index 000000000..4d04ebacf --- /dev/null +++ b/.github/contributors/cedar101.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Kim, Baeg-il | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-07-03 | +| GitHub username | cedar101 | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index ef586ac8d..35d431d48 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,8 @@ parts/ sdist/ var/ *.egg-info/ +pip-wheel-metadata/ +Pipfile.lock .installed.cfg *.egg .eggs diff --git a/setup.py b/setup.py index 33623588c..544188f4a 100755 --- a/setup.py +++ b/setup.py @@ -246,6 +246,7 @@ def setup_package(): "cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"], # Language tokenizers with external dependencies "ja": ["mecab-python3==0.7"], + "ko": ["natto-py==0.9.0"], }, python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*", classifiers=[ diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py new file mode 100644 index 000000000..111d01720 --- /dev/null +++ b/spacy/lang/ko/__init__.py @@ -0,0 +1,118 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +import re +import sys + + +from .stop_words import STOP_WORDS +from .tag_map import TAG_MAP, POS +from ...attrs import LANG +from ...language import Language +from ...tokens import Doc +from ...compat import copy_reg +from ...util import DummyTokenizer +from ...compat import is_python3, is_python_pre_3_5 + +is_python_post_3_7 = is_python3 and sys.version_info[1] >= 7 + +# fmt: off +if is_python_pre_3_5: + from collections import namedtuple + Morpheme = namedtuple("Morpheme", "surface lemma tag") +elif is_python_post_3_7: + from dataclasses import dataclass + @dataclass(frozen=True) + class Morpheme: + surface: str + lemma: str + tag: str +else: + from typing import NamedTuple + class Morpheme(NamedTuple): + surface: str + lemma: str + tag: str + + +def try_mecab_import(): + try: + from natto import MeCab + return MeCab + except ImportError: + raise ImportError( + "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " + "and [natto-py](https://github.com/buruzaemon/natto-py)" + ) +# fmt: on + + +def check_spaces(text, tokens): + token_pattern = re.compile(r"\s?".join(f"({t})" for t in tokens)) + m = token_pattern.match(text) + if m is not None: + for i in range(1, m.lastindex): + yield m.end(i) < m.start(i + 1) + yield False + + +class KoreanTokenizer(DummyTokenizer): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + self.Tokenizer = try_mecab_import() + + def __call__(self, text): + dtokens = 
list(self.detailed_tokens(text)) + surfaces = [dt.surface for dt in dtokens] + doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces))) + for token, dtoken in zip(doc, dtokens): + first_tag, sep, eomi_tags = dtoken.tag.partition("+") + token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) + token.lemma_ = dtoken.lemma + doc.user_data["full_tags"] = [dt.tag for dt in dtokens] + return doc + + def detailed_tokens(self, text): + # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], + # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * + with self.Tokenizer("-F%f[0],%f[7]") as tokenizer: + for node in tokenizer.parse(text, as_nodes=True): + if node.is_eos(): + break + surface = node.surface + feature = node.feature + tag, _, expr = feature.partition(",") + lemma, _, remainder = expr.partition("/") + if lemma == "*": + lemma = surface + yield Morpheme(surface, lemma, tag) + + +class KoreanDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda _text: "ko" + stop_words = STOP_WORDS + tag_map = TAG_MAP + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + @classmethod + def create_tokenizer(cls, nlp=None): + return KoreanTokenizer(cls, nlp) + + +class Korean(Language): + lang = "ko" + Defaults = KoreanDefaults + + def make_doc(self, text): + return self.tokenizer(text) + + +def pickle_korean(instance): + return Korean, tuple() + + +copy_reg.pickle(Korean, pickle_korean) + +__all__ = ["Korean"] diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py new file mode 100644 index 000000000..10a6ea9bd --- /dev/null +++ b/spacy/lang/ko/examples.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ko.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.", + "자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.", + "자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.", + "런던은 영국의 수도이자 가장 큰 도시입니다." 
+] diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py new file mode 100644 index 000000000..53cf6f29a --- /dev/null +++ b/spacy/lang/ko/stop_words.py @@ -0,0 +1,68 @@ +# coding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set(""" +이 +있 +하 +것 +들 +그 +되 +수 +이 +보 +않 +없 +나 +주 +아니 +등 +같 +때 +년 +가 +한 +지 +오 +말 +일 +그렇 +위하 +때문 +그것 +두 +말하 +알 +그러나 +받 +못하 +일 +그런 +또 +더 +많 +그리고 +좋 +크 +시키 +그러 +하나 +살 +데 +안 +어떤 +번 +나 +다른 +어떻 +들 +이렇 +점 +싶 +말 +좀 +원 +잘 +놓 +""".split()) diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py new file mode 100644 index 000000000..ed6b58170 --- /dev/null +++ b/spacy/lang/ko/tag_map.py @@ -0,0 +1,66 @@ +# encoding: utf8 +from __future__ import unicode_literals +from collections import defaultdict + +from ...symbols import (POS, PUNCT, INTJ, X, SYM, + ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, + NUM, DET) + +# 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 +# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 +# https://universaldependencies.org/u/pos/ +TAG_MAP = { + # J.{1,2} 조사 + "JKS": {POS: ADP}, + "JKC": {POS: ADP}, + "JKG": {POS: ADP}, + "JKO": {POS: ADP}, + "JKB": {POS: ADP}, + "JKV": {POS: ADP}, + "JKQ": {POS: ADP}, + "JX": {POS: ADP}, # 보조사 + "JC": {POS: CONJ}, # 접속 조사 + "MAJ": {POS: CONJ}, # 접속 부사 + "MAG": {POS: ADV}, # 일반 부사 + "MM": {POS: DET}, # 관형사 + + "XPN": {POS: X}, # 접두사 + # XS. 접미사 + "XSN": {POS: X}, + "XSV": {POS: X}, + "XSA": {POS: X}, + "XR": {POS: X}, # 어근 + # E.{1,2} 어미 + "EP": {POS: X}, + "EF": {POS: X}, + "EC": {POS: X}, + "ETN": {POS: X}, + "ETM": {POS: X}, + + "IC": {POS: INTJ}, # 감탄사 + + "VV": {POS: VERB}, # 동사 + "VA": {POS: ADJ}, # 형용사 + "VX": {POS: AUX}, # 보조 용언 + "VCP": {POS: ADP}, # 긍정 지정사(이다) + "VCN": {POS: ADJ}, # 부정 지정사(아니다) + + "NNG": {POS: NOUN}, # 일반 명사(general noun) + "NNB": {POS: NOUN}, # 의존 명사 + "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) + "NNP": {POS: PROPN}, # 고유 명사(proper noun) + "NP": {POS: PRON}, # 대명사 + "NR": {POS: NUM}, # 수사(numerals) + "SN": {POS: NUM}, # 숫자 + + # S.{1,2} 부호 + # 문장 부호 + "SF": {POS: PUNCT}, # period or other EOS marker + "SE": {POS: PUNCT}, + "SC": {POS: PUNCT}, # comma, etc. 
+ "SSO": {POS: PUNCT}, # open bracket + "SSC": {POS: PUNCT}, # close bracket + "SY": {POS: SYM}, # 기타 기호 + "SL": {POS: X}, # 외국어 + "SH": {POS: X}, # 한자 +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4bef85a1b..fdd86616d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -124,6 +124,12 @@ def ja_tokenizer(): return get_lang_class("ja").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ko_tokenizer(): + pytest.importorskip("natto") + return get_lang_class("ko").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def lt_tokenizer(): return get_lang_class("lt").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ko/__init__.py b/spacy/tests/lang/ko/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py new file mode 100644 index 000000000..67371d4ce --- /dev/null +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize( + "word,lemma", + [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")], +) +def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): + test_lemma = ko_tokenizer(word)[0].lemma_ + assert test_lemma == lemma diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py new file mode 100644 index 000000000..bd1d94aec --- /dev/null +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +# fmt: off +TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), + ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] + +TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", + "NNP NNG NNG JKB VV EC VX EF SF"), + ("영등포구에 있는 맛집 좀 알려주세요.", + "NNP JKB VV ETM NNG MAG VV VX EP SF")] + +FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", + "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] + +POS_TESTS = [("서울 타워 근처에 살고 있습니다.", + "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), + ("영등포구에 있는 맛집 좀 알려주세요.", + "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) +def test_ko_tokenizer(ko_tokenizer, text, expected_tokens): + tokens = [token.text for token in ko_tokenizer(text)] + assert tokens == expected_tokens.split() + + +@pytest.mark.parametrize("text,expected_tags", TAG_TESTS) +def test_ko_tokenizer_tags(ko_tokenizer, text, expected_tags): + tags = [token.tag_ for token in ko_tokenizer(text)] + assert tags == expected_tags.split() + + +@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS) +def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags): + tags = ko_tokenizer(text).user_data["full_tags"] + assert tags == expected_tags.split() + + +@pytest.mark.parametrize("text,expected_pos", POS_TESTS) +def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): + pos = [token.pos_ for token in ko_tokenizer(text)] + assert pos == expected_pos.split() diff --git a/website/meta/languages.json b/website/meta/languages.json index cfa468d7f..1169a3397 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -153,6 +153,17 @@ "example": "これは文章です。", "has_examples": true }, + { + "code": "ko", + "name": "Korean", + "dependencies": [ + { "name": "mecab-ko", "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" }, + { "name": "mecab-ko-dic", "url": 
"https://bitbucket.org/eunjeon/mecab-ko-dic" }, + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py"} + ], + "example": "이것은 문장입니다.", + "has_examples": true + }, { "code": "vi", "name": "Vietnamese", From 205c73a58914b3fd9aebdd0708582fb7a80fd625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20B=C3=B6ing?= Date: Wed, 10 Jul 2019 10:16:48 +0200 Subject: [PATCH 109/148] Update tokenizer and doc init example (#3939) * Fix Doc.to_json hyperlink * Update tokenizer and doc init examples * Change "matchin rules" to "punctuation rules" * Auto-format --- spacy/tokens/doc.pyx | 5 +++-- website/docs/api/doc.md | 2 +- website/docs/api/tokenizer.md | 8 +++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 131c43d37..373771247 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -85,13 +85,14 @@ cdef class Doc: Python-level `Token` and `Span` objects are views of this array, i.e. they don't own the data themselves. - EXAMPLE: Construction 1 + EXAMPLE: + Construction 1 >>> doc = nlp(u'Some text') Construction 2 >>> from spacy.tokens import Doc >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], - spaces=[True, False, False]) + >>> spaces=[True, False, False]) DOCS: https://spacy.io/api/doc """ diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index f5a94335f..bf9801564 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -264,7 +264,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | ----------- | -------------------------------------- | ----------------------------------------------- | | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | -## Doc.to_json {#to_json, tag="method" new="2.1"} +## Doc.to_json {#to_json tag="method" new="2.1"} Convert a Doc to JSON. The format it produces will be the new format for the [`spacy train`](/api/cli#train) command (not implemented yet). If custom diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 5bc0df625..67e67f5c9 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -9,7 +9,10 @@ Segment text, and create `Doc` objects with the discovered segment boundaries. ## Tokenizer.\_\_init\_\_ {#init tag="method"} -Create a `Tokenizer`, to create `Doc` objects given unicode text. +Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples +of how to construct a custom tokenizer with different tokenization rules, see +the +[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers). > #### Example > @@ -18,11 +21,14 @@ Create a `Tokenizer`, to create `Doc` objects given unicode text. 
> from spacy.tokenizer import Tokenizer > from spacy.lang.en import English > nlp = English() +> # Create a blank Tokenizer with just the English vocab > tokenizer = Tokenizer(nlp.vocab) > > # Construction 2 > from spacy.lang.en import English > nlp = English() +> # Create a Tokenizer with the default settings for English +> # including punctuation rules and exceptions > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` From 881f5bc401a2c16294a4152d05981ebd8e7691c5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 10:27:29 +0200 Subject: [PATCH 110/148] Auto-format --- website/docs/usage/training.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 91513588c..773b70f05 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -630,13 +630,13 @@ should be somewhat larger, especially if your documents are long. ### Learning rate, regularization and gradient clipping {#tips-hyperparams} -By default spaCy uses the Adam solver, with default settings (`learn_rate=0.001`, -`beta1=0.9`, `beta2=0.999`). Some researchers have said they found -these settings terrible on their problems – but they've always performed very -well in training spaCy's models, in combination with the rest of our recipe. You -can change these settings directly, by modifying the corresponding attributes on -the `optimizer` object. You can also set environment variables, to adjust the -defaults. +By default spaCy uses the Adam solver, with default settings +(`learn_rate=0.001`, `beta1=0.9`, `beta2=0.999`). Some researchers have said +they found these settings terrible on their problems – but they've always +performed very well in training spaCy's models, in combination with the rest of +our recipe. You can change these settings directly, by modifying the +corresponding attributes on the `optimizer` object. You can also set environment +variables, to adjust the defaults. There are two other key hyper-parameters of the solver: `L2` **regularization**, and **gradient clipping** (`max_grad_norm`). Gradient clipping is a hack that's From ebe58e7fa18af919eb69b9e468d0ec30c9338dcc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 10:27:33 +0200 Subject: [PATCH 111/148] Document gold.docs_to_json [ci skip] --- website/docs/api/annotation.md | 4 +++- website/docs/api/goldparse.md | 21 +++++++++++++++++++++ website/docs/usage/training.md | 3 +++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index a5bb30b6f..ed0e0b3e0 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -520,7 +520,9 @@ spaCy takes training data in JSON format. The built-in [`convert`](/api/cli#convert) command helps you convert the `.conllu` format used by the [Universal Dependencies corpora](https://github.com/UniversalDependencies) to -spaCy's training format. +spaCy's training format. To convert one or more existing `Doc` objects to +spaCy's JSON format, you can use the +[`gold.docs_to_json`](/api/goldparse#docs_to_json) helper. > #### Annotating entities > diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index ca5b6a811..13f68a85d 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -55,6 +55,27 @@ Whether the provided syntactic annotations form a projective dependency tree. 
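The training documentation above notes that the Adam settings can be changed by modifying the corresponding attributes on the `optimizer` object returned by `nlp.begin_training()`. A minimal sketch of what that looks like, assuming the attribute names mirror the hyperparameters named in the prose (`learn_rate`, `L2`, `max_grad_norm`); the exact names exposed by the Thinc optimizer may differ between versions:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.get_pipe("ner").add_label("ANIMAL")

# begin_training() returns the optimizer that is later passed to nlp.update()
optimizer = nlp.begin_training()

# Assumed attribute names, mirroring the hyperparameters described above
optimizer.learn_rate = 0.001   # Adam learning rate
optimizer.L2 = 1e-6            # L2 regularization penalty
optimizer.max_grad_norm = 1.0  # gradient clipping threshold
```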
## Utilities {#util} +### gold.docs_to_json {#docs_to_json tag="function"} + +Convert a list of Doc objects into the +[JSON-serializable format](/api/annotation#json-input) used by the +[`spacy train`](/api/cli#train) command. + +> #### Example +> +> ```python +> from spacy.gold import docs_to_json +> +> doc = nlp(u"I like London") +> json_data = docs_to_json([doc]) +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | ------------------------------------------ | +| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. | +| `id` | int | ID to assign to the JSON. Defaults to `0`. | +| **RETURNS** | list | The data in spaCy's JSON format. | + ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 773b70f05..b84bf4e12 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -39,6 +39,9 @@ mkdir models python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json ``` +You can also use the [`gold.docs_to_json`](/api/goldparse#docs_to_json) helper +to convert a list of `Doc` objects to spaCy's JSON training format. + #### Understanding the training output When you train a model using the [`spacy train`](/api/cli#train) command, you'll From 8721849423e42fe99cdd6905aa98b94af446d82b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 11:19:28 +0200 Subject: [PATCH 112/148] Update Scorer.ents_per_type --- spacy/scorer.py | 19 ++++++++++--------- website/docs/api/scorer.md | 21 +++++++++++---------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index c01353520..b9994e3f2 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -92,6 +92,15 @@ class Scorer(object): """RETURNS (float): Named entity accuracy (F-score).""" return self.ner.fscore * 100 + @property + def ents_per_type(self): + """RETURNS (dict): Scores per entity label. + """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.ner_per_ents.items() + } + @property def scores(self): """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`, @@ -103,17 +112,9 @@ class Scorer(object): "ents_p": self.ents_p, "ents_r": self.ents_r, "ents_f": self.ents_f, + "ents_per_type": self.ents_per_type, "tags_acc": self.tags_acc, "token_acc": self.token_acc, - "ents_per_type": self.__scores_per_ents(), - } - - def __scores_per_ents(self): - """RETURNS (dict): Scores per NER entity - """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.ner_per_ents.items() } def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index e6a8595fd..2af4ec0ce 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -46,13 +46,14 @@ Update the evaluation scores from a single [`Doc`](/api/doc) / ## Properties -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------------------------- | -| `token_acc` | float | Tokenization accuracy. | -| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | -| `uas` | float | Unlabelled dependency score. | -| `las` | float | Labelled dependency score. | -| `ents_p` | float | Named entity accuracy (precision). 
| -| `ents_r` | float | Named entity accuracy (recall). | -| `ents_f` | float | Named entity accuracy (F-score). | -| `scores` | dict | All scores with keys `uas`, `las`, `ents_p`, `ents_r`, `ents_f`, `tags_acc` and `token_acc`. | +| Name | Type | Description | +| ---------------------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------- | +| `token_acc` | float | Tokenization accuracy. | +| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | +| `uas` | float | Unlabelled dependency score. | +| `las` | float | Labelled dependency score. | +| `ents_p` | float | Named entity accuracy (precision). | +| `ents_r` | float | Named entity accuracy (recall). | +| `ents_f` | float | Named entity accuracy (F-score). | +| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | +| `scores` | dict | All scores with keys `uas`, `las`, `ents_p`, `ents_r`, `ents_f`, `ents_per_type`, `tags_acc` and `token_acc`. | From 4ebb4865fe057192b8649e6a5c4bd33c60d49981 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 11:19:48 +0200 Subject: [PATCH 113/148] Update languages.json --- website/meta/languages.json | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/website/meta/languages.json b/website/meta/languages.json index 1169a3397..ef336ef5f 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -104,6 +104,7 @@ { "code": "ga", "name": "Irish" }, { "code": "bn", "name": "Bengali", "has_examples": true }, { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, + { "code": "mr", "name": "Marathi" }, { "code": "kn", "name": "Kannada" }, { "code": "ta", "name": "Tamil", "has_examples": true }, { @@ -157,9 +158,12 @@ "code": "ko", "name": "Korean", "dependencies": [ - { "name": "mecab-ko", "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" }, + { + "name": "mecab-ko", + "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" + }, { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, - { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py"} + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } ], "example": "이것은 문장입니다.", "has_examples": true From ea2050079b61b89a5c6e75951c4565aa504a2510 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:03:05 +0200 Subject: [PATCH 114/148] Auto-format --- spacy/pipeline/entityruler.py | 64 +++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4f89e4186..35fefd02c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -10,7 +10,7 @@ from ..util import ensure_path, to_disk, from_disk from ..tokens import Span from ..matcher import Matcher, PhraseMatcher -DEFAULT_ENT_ID_SEP = '||' +DEFAULT_ENT_ID_SEP = "||" class EntityRuler(object): @@ -53,7 +53,9 @@ class EntityRuler(object): self.matcher = Matcher(nlp.vocab) if phrase_matcher_attr is not None: self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher(nlp.vocab, attr=self.phrase_matcher_attr) + self.phrase_matcher = PhraseMatcher( + nlp.vocab, attr=self.phrase_matcher_attr + ) else: self.phrase_matcher_attr = None self.phrase_matcher = PhraseMatcher(nlp.vocab) @@ -223,13 +225,14 @@ 
class EntityRuler(object): """ cfg = srsly.msgpack_loads(patterns_bytes) if isinstance(cfg, dict): - self.add_patterns(cfg.get('patterns', cfg)) - self.overwrite = cfg.get('overwrite', False) - self.phrase_matcher_attr = cfg.get('phrase_matcher_attr', None) + self.add_patterns(cfg.get("patterns", cfg)) + self.overwrite = cfg.get("overwrite", False) + self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher(self.nlp.vocab, - attr=self.phrase_matcher_attr) - self.ent_id_sep = cfg.get('ent_id_sep', DEFAULT_ENT_ID_SEP) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) + self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: self.add_patterns(cfg) return self @@ -242,11 +245,14 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#to_bytes """ - serial = OrderedDict(( - ('overwrite', self.overwrite), - ('ent_id_sep', self.ent_id_sep), - ('phrase_matcher_attr', self.phrase_matcher_attr), - ('patterns', self.patterns))) + serial = OrderedDict( + ( + ("overwrite", self.overwrite), + ("ent_id_sep", self.ent_id_sep), + ("phrase_matcher_attr", self.phrase_matcher_attr), + ("patterns", self.patterns), + ) + ) return srsly.msgpack_dumps(serial) def from_disk(self, path, **kwargs): @@ -266,17 +272,20 @@ class EntityRuler(object): else: cfg = {} deserializers = { - 'patterns': lambda p: self.add_patterns(srsly.read_jsonl(p.with_suffix('.jsonl'))), - 'cfg': lambda p: cfg.update(srsly.read_json(p)) + "patterns": lambda p: self.add_patterns( + srsly.read_jsonl(p.with_suffix(".jsonl")) + ), + "cfg": lambda p: cfg.update(srsly.read_json(p)), } from_disk(path, deserializers, {}) - self.overwrite = cfg.get('overwrite', False) - self.phrase_matcher_attr = cfg.get('phrase_matcher_attr') - self.ent_id_sep = cfg.get('ent_id_sep', DEFAULT_ENT_ID_SEP) + self.overwrite = cfg.get("overwrite", False) + self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") + self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher(self.nlp.vocab, - attr=self.phrase_matcher_attr) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) return self def to_disk(self, path, **kwargs): @@ -289,13 +298,16 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#to_disk """ - cfg = {'overwrite': self.overwrite, - 'phrase_matcher_attr': self.phrase_matcher_attr, - 'ent_id_sep': self.ent_id_sep} + cfg = { + "overwrite": self.overwrite, + "phrase_matcher_attr": self.phrase_matcher_attr, + "ent_id_sep": self.ent_id_sep, + } serializers = { - 'patterns': lambda p: srsly.write_jsonl(p.with_suffix('.jsonl'), - self.patterns), - 'cfg': lambda p: srsly.write_json(p, cfg) + "patterns": lambda p: srsly.write_jsonl( + p.with_suffix(".jsonl"), self.patterns + ), + "cfg": lambda p: srsly.write_json(p, cfg), } path = ensure_path(path) to_disk(path, serializers, {}) From 874d914a440553f8e4e3964b5677647ca9b2d967 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:13:23 +0200 Subject: [PATCH 115/148] Tidy up test --- spacy/tests/regression/test_issue3526.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index 3949c4b1c..62c9d5532 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -7,6 +7,7 @@ from 
spacy.language import Language from spacy.pipeline import EntityRuler from spacy import load import srsly + from ..util import make_tempdir @@ -79,8 +80,10 @@ def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): nlp.add_pipe(ruler) with make_tempdir() as tmpdir: nlp.to_disk(tmpdir) - assert nlp.pipeline[-1][-1].patterns == [{"label": "ORG", "pattern": "Apple"}] - assert nlp.pipeline[-1][-1].overwrite is True + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True nlp2 = load(tmpdir) - assert nlp2.pipeline[-1][-1].patterns == [{"label": "ORG", "pattern": "Apple"}] - assert nlp2.pipeline[-1][-1].overwrite is True + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True From 570ab1f481fabbbc520d965e14494637680a22b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:14:12 +0200 Subject: [PATCH 116/148] Fix handling of old entity ruler files Expected an `entity_ruler.jsonl` file in the top-level model directory, so the path passed to from_disk by default (model path plus componentn name), but with the suffix ".jsonl". --- spacy/pipeline/entityruler.py | 5 +++-- spacy/tests/regression/test_issue3526.py | 7 +++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 35fefd02c..9bbbb2c48 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -266,8 +266,9 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#from_disk """ path = ensure_path(path) - if path.is_file(): - patterns = srsly.read_jsonl(path) + depr_patterns_path = path.with_suffix(".jsonl") + if depr_patterns_path.is_file(): + patterns = srsly.read_jsonl(depr_patterns_path) self.add_patterns(patterns) else: cfg = {} diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index 62c9d5532..c6f513730 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -62,10 +62,9 @@ def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): nlp = Language(vocab=en_vocab) ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler.jsonl" - srsly.write_jsonl(out_file, ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_disk(out_file) + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) for pattern in ruler.patterns: assert pattern in new_ruler.patterns assert len(new_ruler) == len(ruler) From 40cd03fc358b12568aea95f7d11cc122677ad7dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:25:45 +0200 Subject: [PATCH 117/148] Improve EntityRuler serialization --- spacy/pipeline/entityruler.py | 9 ++++--- website/docs/api/entityruler.md | 46 +++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 9bbbb2c48..35b465ceb 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -293,12 +293,13 @@ class EntityRuler(object): """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). - path (unicode / Path): The JSONL file to load. 
+ path (unicode / Path): The JSONL file to save. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. DOCS: https://spacy.io/api/entityruler#to_disk """ + path = ensure_path(path) cfg = { "overwrite": self.overwrite, "phrase_matcher_attr": self.phrase_matcher_attr, @@ -310,5 +311,7 @@ class EntityRuler(object): ), "cfg": lambda p: srsly.write_json(p, cfg), } - path = ensure_path(path) - to_disk(path, serializers, {}) + if path.suffix == ".jsonl": # user wants to save only JSONL + srsly.write_jsonl(path, self.patterns) + else: + to_disk(path, serializers, {}) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index dcbf99da5..5c05450f8 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -30,14 +30,14 @@ be a token pattern (list) or a phrase pattern (string). For example: > ruler = EntityRuler(nlp, overwrite_ents=True) > ``` -| Name | Type | Description | -| ---------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | -| `patterns` | iterable | Optional patterns to load in. | -| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phtasematcher). defaults to `None` -| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | -| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | -| **RETURNS** | `EntityRuler` | The newly constructed object. | +| Name | Type | Description | +| --------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | +| `patterns` | iterable | Optional patterns to load in. | +| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phtasematcher). defaults to `None` | +| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | +| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | +| **RETURNS** | `EntityRuler` | The newly constructed object. | ## EntityRuler.\_\len\_\_ {#len tag="method"} @@ -123,35 +123,41 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on ## EntityRuler.to_disk {#to_disk tag="method"} Save the entity ruler patterns to a directory. The patterns will be saved as -newline-delimited JSON (JSONL). +newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided, +only the patterns are saved as JSONL. If a directory name is provided, a +`patterns.jsonl` and `cfg` file with the component configuration is exported. 
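Beyond the single-call examples that follow, the point of the directory format is that the component configuration survives a save/load cycle. A minimal round-trip sketch, using a placeholder path as in the surrounding examples:

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
ruler.to_disk("/path/to/entity_ruler")  # writes patterns.jsonl and cfg

new_ruler = EntityRuler(nlp).from_disk("/path/to/entity_ruler")
assert {"label": "ORG", "pattern": "Apple"} in new_ruler.patterns
assert new_ruler.overwrite is True               # restored from cfg
assert new_ruler.phrase_matcher_attr == "LOWER"  # restored from cfg
```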
> #### Example > > ```python > ruler = EntityRuler(nlp) -> ruler.to_disk("/path/to/rules.jsonl") +> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only +> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ``` -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## EntityRuler.from_disk {#from_disk tag="method"} -Load the entity ruler from a file. Expects a file containing newline-delimited -JSON (JSONL) with one entry per line. +Load the entity ruler from a file. Expects either a file containing +newline-delimited JSON (JSONL) with one entry per line, or a directory +containing a `patterns.jsonl` file and a `cfg` file with the component +configuration. > #### Example > > ```python > ruler = EntityRuler(nlp) -> ruler.from_disk("/path/to/rules.jsonl") +> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only +> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Type | Description | +| ----------- | ---------------- | ---------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. 
| ## EntityRuler.to_bytes {#to_bytes tag="method"} From 82045aac8a948c4c1ae0f5a04314c8111c06d34d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:49:18 +0200 Subject: [PATCH 118/148] Merge regression tests --- spacy/tests/regression/test_issue2001-2500.py | 10 + spacy/tests/regression/test_issue3001-3500.py | 334 ++++++++++++++++++ spacy/tests/regression/test_issue3002.py | 11 - spacy/tests/regression/test_issue3009.py | 67 ---- spacy/tests/regression/test_issue3012.py | 31 -- spacy/tests/regression/test_issue3199.py | 15 - spacy/tests/regression/test_issue3209.py | 23 -- spacy/tests/regression/test_issue3248.py | 27 -- spacy/tests/regression/test_issue3277.py | 11 - spacy/tests/regression/test_issue3288.py | 18 - spacy/tests/regression/test_issue3289.py | 15 - spacy/tests/regression/test_issue3328.py | 19 - spacy/tests/regression/test_issue3331.py | 21 -- spacy/tests/regression/test_issue3345.py | 26 -- spacy/tests/regression/test_issue3356.py | 72 ---- spacy/tests/regression/test_issue3410.py | 21 -- spacy/tests/regression/test_issue3447.py | 14 - spacy/tests/regression/test_issue3449.py | 21 -- spacy/tests/regression/test_issue3468.py | 21 -- 19 files changed, 344 insertions(+), 433 deletions(-) create mode 100644 spacy/tests/regression/test_issue3001-3500.py delete mode 100644 spacy/tests/regression/test_issue3002.py delete mode 100644 spacy/tests/regression/test_issue3009.py delete mode 100644 spacy/tests/regression/test_issue3012.py delete mode 100644 spacy/tests/regression/test_issue3199.py delete mode 100644 spacy/tests/regression/test_issue3209.py delete mode 100644 spacy/tests/regression/test_issue3248.py delete mode 100644 spacy/tests/regression/test_issue3277.py delete mode 100644 spacy/tests/regression/test_issue3288.py delete mode 100644 spacy/tests/regression/test_issue3289.py delete mode 100644 spacy/tests/regression/test_issue3328.py delete mode 100644 spacy/tests/regression/test_issue3331.py delete mode 100644 spacy/tests/regression/test_issue3345.py delete mode 100644 spacy/tests/regression/test_issue3356.py delete mode 100644 spacy/tests/regression/test_issue3410.py delete mode 100644 spacy/tests/regression/test_issue3447.py delete mode 100644 spacy/tests/regression/test_issue3449.py delete mode 100644 spacy/tests/regression/test_issue3468.py diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 82b3a81a9..4292c8d23 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest import numpy from spacy.tokens import Doc +from spacy.matcher import Matcher from spacy.displacy import render from spacy.gold import iob_to_biluo from spacy.lang.it import Italian @@ -123,6 +124,15 @@ def test_issue2396(en_vocab): assert (span.get_lca_matrix() == matrix).all() +def test_issue2464(en_vocab): + """Test problem with successive ?. 
This is the same bug, so putting it here.""" + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["a", "b"]) + matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) + matches = matcher(doc) + assert len(matches) == 3 + + def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py new file mode 100644 index 000000000..3b0c2f1ed --- /dev/null +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -0,0 +1,334 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.en import English +from spacy.lang.de import German +from spacy.pipeline import EntityRuler, EntityRecognizer +from spacy.matcher import Matcher, PhraseMatcher +from spacy.tokens import Doc +from spacy.vocab import Vocab +from spacy.attrs import ENT_IOB, ENT_TYPE +from spacy.compat import pickle, is_python2, unescape_unicode +from spacy import displacy +from spacy.util import decaying +import numpy +import re + +from ..util import get_doc + + +def test_issue3002(): + """Test that the tokenizer doesn't hang on a long list of dots""" + nlp = German() + doc = nlp( + "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" + ) + assert len(doc) == 5 + + +def test_issue3009(en_vocab): + """Test problem with matcher quantifiers""" + patterns = [ + [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}], + [ + {"LEMMA": "have"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"POS": "ADP"}, + ], + [ + {"LEMMA": "have"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"POS": "ADP"}, + ], + ] + words = ["also", "has", "to", "do", "with"] + tags = ["RB", "VBZ", "TO", "VB", "IN"] + doc = get_doc(en_vocab, words=words, tags=tags) + matcher = Matcher(en_vocab) + for i, pattern in enumerate(patterns): + matcher.add(str(i), None, pattern) + matches = matcher(doc) + assert matches + + +def test_issue3012(en_vocab): + """Test that the is_tagged attribute doesn't get overwritten when we from_array + without tag information.""" + words = ["This", "is", "10", "%", "."] + tags = ["DT", "VBZ", "CD", "NN", "."] + pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] + ents = [(2, 4, "PERCENT")] + doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + assert doc.is_tagged + + expected = ("10", "NUM", "CD", "PERCENT") + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + + header = [ENT_IOB, ENT_TYPE] + ent_array = doc.to_array(header) + doc.from_array(header, ent_array) + + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + + # Serializing then deserializing + doc_bytes = doc.to_bytes() + doc2 = Doc(en_vocab).from_bytes(doc_bytes) + assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected + + +def test_issue3199(): + """Test that Span.noun_chunks works correctly if no noun chunks iterator + is available. To make this test future-proof, we're constructing a Doc + with a new Vocab here and setting is_parsed to make sure the noun chunks run. 
+ """ + doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) + doc.is_parsed = True + assert list(doc[0:3].noun_chunks) == [] + + +def test_issue3209(): + """Test issue that occurred in spaCy nightly where NER labels were being + mapped to classes incorrectly after loading the model, when the labels + were added using ner.add_label(). + """ + nlp = English() + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + + ner.add_label("ANIMAL") + nlp.begin_training() + move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] + assert ner.move_names == move_names + nlp2 = English() + nlp2.add_pipe(nlp2.create_pipe("ner")) + nlp2.from_bytes(nlp.to_bytes()) + assert nlp2.get_pipe("ner").move_names == move_names + + +def test_issue3248_1(): + """Test that the PhraseMatcher correctly reports its number of rules, not + total number of patterns.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) + matcher.add("TEST2", None, nlp("d")) + assert len(matcher) == 2 + + +def test_issue3248_2(): + """Test that the PhraseMatcher can be pickled correctly.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) + matcher.add("TEST2", None, nlp("d")) + data = pickle.dumps(matcher) + new_matcher = pickle.loads(data) + assert len(new_matcher) == len(matcher) + + +def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013" + + +def test_issue3288(en_vocab): + """Test that retokenization works correctly via displaCy when punctuation + is merged onto the preceeding token and tensor is resized.""" + words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] + heads = [1, 0, -1, 1, 0, 1, -2, -3] + deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc.tensor = numpy.zeros((len(words), 96), dtype="float32") + displacy.render(doc) + + +def test_issue3289(): + """Test that Language.to_bytes handles serializing a pipeline component + with an uninitialized model.""" + nlp = English() + nlp.add_pipe(nlp.create_pipe("textcat")) + bytes_data = nlp.to_bytes() + new_nlp = English() + new_nlp.add_pipe(nlp.create_pipe("textcat")) + new_nlp.from_bytes(bytes_data) + + +def test_issue3328(en_vocab): + doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) + matcher = Matcher(en_vocab) + patterns = [ + [{"LOWER": {"IN": ["hello", "how"]}}], + [{"LOWER": {"IN": ["you", "doing"]}}], + ] + matcher.add("TEST", None, *patterns) + matches = matcher(doc) + assert len(matches) == 4 + matched_texts = [doc[start:end].text for _, start, end in matches] + assert matched_texts == ["Hello", "how", "you", "doing"] + + +@pytest.mark.xfail +def test_issue3331(en_vocab): + """Test that duplicate patterns for different rules result in multiple + matches, one per rule. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) + matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] + + +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = English() + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + ner = EntityRecognizer(doc.vocab) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") + + +if is_python2: + # If we have this test in Python 3, pytest chokes, as it can't print the + # string above in the xpass message. + prefix_search = ( + b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" + b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" + b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" + b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" + b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" + b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" + b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" + b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" + b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" + b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" + b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" + b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" + b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" + b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" + b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" + b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" + b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" + b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" + b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" + b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" + b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" + b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" + b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" + b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" + b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" + b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" + b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" + b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" + b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" + b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" + b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" + b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" + b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" + 
b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" + b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" + b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" + b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" + b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" + b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" + b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" + b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" + b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" + b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" + b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" + b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" + b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" + b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" + b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" + b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" + b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" + b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" + b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" + b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" + b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" + b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" + b"\\U0001FA60-\\U0001FA6D]" + ) + + def test_issue3356(): + pattern = re.compile(unescape_unicode(prefix_search.decode("utf8"))) + assert not pattern.search("hello") + + +def test_issue3410(): + texts = ["Hello world", "This is a test"] + nlp = English() + matcher = Matcher(nlp.vocab) + phrasematcher = PhraseMatcher(nlp.vocab) + with pytest.deprecated_call(): + docs = list(nlp.pipe(texts, n_threads=4)) + with pytest.deprecated_call(): + docs = list(nlp.tokenizer.pipe(texts, n_threads=4)) + with pytest.deprecated_call(): + list(matcher.pipe(docs, n_threads=4)) + with pytest.deprecated_call(): + list(phrasematcher.pipe(docs, n_threads=4)) + + +def test_issue3447(): + sizes = decaying(10.0, 1.0, 0.5) + size = next(sizes) + assert size == 10.0 + size = next(sizes) + assert size == 10.0 - 0.5 + size = next(sizes) + assert size == 10.0 - 0.5 - 0.5 + + +@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot") +def test_issue3449(): + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + text1 = "He gave the ball to I. Do you want to go to the movies with I?" + text2 = "He gave the ball to I. Do you want to go to the movies with I?" + text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" 
+ t1 = nlp(text1) + t2 = nlp(text2) + t3 = nlp(text3) + assert t1[5].text == "I" + assert t2[5].text == "I" + assert t3[5].text == "I" + + +def test_issue3468(): + """Test that sentence boundaries are set correctly so Doc.is_sentenced can + be restored after serialization.""" + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + doc = nlp("Hello world") + assert doc[0].is_sent_start + assert doc.is_sentenced + assert len(list(doc.sents)) == 1 + doc_bytes = doc.to_bytes() + new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) + assert new_doc[0].is_sent_start + assert new_doc.is_sentenced + assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3002.py b/spacy/tests/regression/test_issue3002.py deleted file mode 100644 index 54e661d1f..000000000 --- a/spacy/tests/regression/test_issue3002.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.de import German - - -def test_issue3002(): - """Test that the tokenizer doesn't hang on a long list of dots""" - nlp = German() - doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl') - assert len(doc) == 5 diff --git a/spacy/tests/regression/test_issue3009.py b/spacy/tests/regression/test_issue3009.py deleted file mode 100644 index 25f208903..000000000 --- a/spacy/tests/regression/test_issue3009.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -PATTERNS = [ - ("1", [[{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}]]), - ( - "2", - [ - [ - {"LEMMA": "have"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"POS": "ADP"}, - ] - ], - ), - ( - "3", - [ - [ - {"LEMMA": "have"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"POS": "ADP"}, - ] - ], - ), -] - - -@pytest.fixture -def doc(en_tokenizer): - doc = en_tokenizer("also has to do with") - doc[0].tag_ = "RB" - doc[1].tag_ = "VBZ" - doc[2].tag_ = "TO" - doc[3].tag_ = "VB" - doc[4].tag_ = "IN" - return doc - - -@pytest.fixture -def matcher(en_tokenizer): - return Matcher(en_tokenizer.vocab) - - -@pytest.mark.parametrize("pattern", PATTERNS) -def test_issue3009(doc, matcher, pattern): - """Test problem with matcher quantifiers""" - matcher.add(pattern[0], None, *pattern[1]) - matches = matcher(doc) - assert matches - - -def test_issue2464(matcher): - """Test problem with successive ?. 
This is the same bug, so putting it here.""" - doc = Doc(matcher.vocab, words=["a", "b"]) - matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) - matches = matcher(doc) - assert len(matches) == 3 diff --git a/spacy/tests/regression/test_issue3012.py b/spacy/tests/regression/test_issue3012.py deleted file mode 100644 index 8fdc8b318..000000000 --- a/spacy/tests/regression/test_issue3012.py +++ /dev/null @@ -1,31 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...attrs import ENT_IOB, ENT_TYPE -from ...tokens import Doc -from ..util import get_doc - - -def test_issue3012(en_vocab): - """Test that the is_tagged attribute doesn't get overwritten when we from_array - without tag information.""" - words = ["This", "is", "10", "%", "."] - tags = ["DT", "VBZ", "CD", "NN", "."] - pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = [(2, 4, "PERCENT")] - doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.is_tagged - - expected = ("10", "NUM", "CD", "PERCENT") - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - - header = [ENT_IOB, ENT_TYPE] - ent_array = doc.to_array(header) - doc.from_array(header, ent_array) - - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - - # serializing then deserializing - doc_bytes = doc.to_bytes() - doc2 = Doc(en_vocab).from_bytes(doc_bytes) - assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected diff --git a/spacy/tests/regression/test_issue3199.py b/spacy/tests/regression/test_issue3199.py deleted file mode 100644 index d80a55330..000000000 --- a/spacy/tests/regression/test_issue3199.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.tokens import Doc -from spacy.vocab import Vocab - - -def test_issue3199(): - """Test that Span.noun_chunks works correctly if no noun chunks iterator - is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and setting is_parsed to make sure the noun chunks run. - """ - doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) - doc.is_parsed = True - assert list(doc[0:3].noun_chunks) == [] diff --git a/spacy/tests/regression/test_issue3209.py b/spacy/tests/regression/test_issue3209.py deleted file mode 100644 index 469e38b8c..000000000 --- a/spacy/tests/regression/test_issue3209.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English - - -def test_issue3209(): - """Test issue that occurred in spaCy nightly where NER labels were being - mapped to classes incorrectly after loading the model, when the labels - were added using ner.add_label(). 
- """ - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - - ner.add_label("ANIMAL") - nlp.begin_training() - move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] - assert ner.move_names == move_names - nlp2 = English() - nlp2.add_pipe(nlp2.create_pipe("ner")) - nlp2.from_bytes(nlp.to_bytes()) - assert nlp2.get_pipe("ner").move_names == move_names diff --git a/spacy/tests/regression/test_issue3248.py b/spacy/tests/regression/test_issue3248.py deleted file mode 100644 index c4b592f3c..000000000 --- a/spacy/tests/regression/test_issue3248.py +++ /dev/null @@ -1,27 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.matcher import PhraseMatcher -from spacy.lang.en import English -from spacy.compat import pickle - - -def test_issue3248_1(): - """Test that the PhraseMatcher correctly reports its number of rules, not - total number of patterns.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) - assert len(matcher) == 2 - - -def test_issue3248_2(): - """Test that the PhraseMatcher can be pickled correctly.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) - data = pickle.dumps(matcher) - new_matcher = pickle.loads(data) - assert len(new_matcher) == len(matcher) diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py deleted file mode 100644 index 88ea67774..000000000 --- a/spacy/tests/regression/test_issue3277.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -def test_issue3277(es_tokenizer): - """Test that hyphens are split correctly as prefixes.""" - doc = es_tokenizer("—Yo me llamo... 
–murmuró el niño– Emilio Sánchez Pérez.") - assert len(doc) == 14 - assert doc[0].text == "\u2014" - assert doc[5].text == "\u2013" - assert doc[9].text == "\u2013" diff --git a/spacy/tests/regression/test_issue3288.py b/spacy/tests/regression/test_issue3288.py deleted file mode 100644 index 188bf361c..000000000 --- a/spacy/tests/regression/test_issue3288.py +++ /dev/null @@ -1,18 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import numpy -from spacy import displacy - -from ..util import get_doc - - -def test_issue3288(en_vocab): - """Test that retokenization works correctly via displaCy when punctuation - is merged onto the preceeding token and tensor is resized.""" - words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 0, -1, 1, 0, 1, -2, -3] - deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - doc.tensor = numpy.zeros((len(words), 96), dtype="float32") - displacy.render(doc) diff --git a/spacy/tests/regression/test_issue3289.py b/spacy/tests/regression/test_issue3289.py deleted file mode 100644 index 0e64f07ce..000000000 --- a/spacy/tests/regression/test_issue3289.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.lang.en import English - - -def test_issue3289(): - """Test that Language.to_bytes handles serializing a pipeline component - with an uninitialized model.""" - nlp = English() - nlp.add_pipe(nlp.create_pipe("textcat")) - bytes_data = nlp.to_bytes() - new_nlp = English() - new_nlp.add_pipe(nlp.create_pipe("textcat")) - new_nlp.from_bytes(bytes_data) diff --git a/spacy/tests/regression/test_issue3328.py b/spacy/tests/regression/test_issue3328.py deleted file mode 100644 index c397feebb..000000000 --- a/spacy/tests/regression/test_issue3328.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3328(en_vocab): - doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) - matcher = Matcher(en_vocab) - patterns = [ - [{"LOWER": {"IN": ["hello", "how"]}}], - [{"LOWER": {"IN": ["you", "doing"]}}], - ] - matcher.add("TEST", None, *patterns) - matches = matcher(doc) - assert len(matches) == 4 - matched_texts = [doc[start:end].text for _, start, end in matches] - assert matched_texts == ["Hello", "how", "you", "doing"] diff --git a/spacy/tests/regression/test_issue3331.py b/spacy/tests/regression/test_issue3331.py deleted file mode 100644 index c30712f81..000000000 --- a/spacy/tests/regression/test_issue3331.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -@pytest.mark.xfail -def test_issue3331(en_vocab): - """Test that duplicate patterns for different rules result in multiple - matches, one per rule. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) - matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) - doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) - matches = matcher(doc) - assert len(matches) == 2 - match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] - assert sorted(match_ids) == ["A", "B"] diff --git a/spacy/tests/regression/test_issue3345.py b/spacy/tests/regression/test_issue3345.py deleted file mode 100644 index c358fd7bc..000000000 --- a/spacy/tests/regression/test_issue3345.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.pipeline import EntityRuler, EntityRecognizer - - -def test_issue3345(): - """Test case where preset entity crosses sentence boundary.""" - nlp = English() - doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) - doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - ner = EntityRecognizer(doc.vocab) - # Add the OUT action. I wouldn't have thought this would be necessary... - ner.moves.add_action(5, "") - ner.add_label("GPE") - doc = ruler(doc) - # Get into the state just before "New" - state = ner.moves.init_batch([doc])[0] - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - # Check that B-GPE is valid. - assert ner.moves.is_valid(state, "B-GPE") diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py deleted file mode 100644 index f8d16459c..000000000 --- a/spacy/tests/regression/test_issue3356.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import re -from spacy import compat - -prefix_search = ( - b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" - b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" 
- b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" - b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" - b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" - b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" - b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" - b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" - b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" - b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" - b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" - b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" - b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" - b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" - b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" - b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" - b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" - b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" - b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" - b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" - b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" - b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" - b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" - b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" - b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" - b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" - b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" - b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" - b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" - b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" - b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" - b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" - b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" - b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" - b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" - b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" - b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" - b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" - b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" - b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" - b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" - b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" - b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" - b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" - b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" - b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" - b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" - b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" - b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" - b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" - b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" - b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" - 
b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" - b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" - b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" - b"\\U0001FA60-\\U0001FA6D]" -) - - -if compat.is_python2: - # If we have this test in Python 3, pytest chokes, as it can't print the - # string above in the xpass message. - def test_issue3356(): - pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8"))) - assert not pattern.search("hello") diff --git a/spacy/tests/regression/test_issue3410.py b/spacy/tests/regression/test_issue3410.py deleted file mode 100644 index 5d2ac5ba3..000000000 --- a/spacy/tests/regression/test_issue3410.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest -from spacy.lang.en import English -from spacy.matcher import Matcher, PhraseMatcher - - -def test_issue3410(): - texts = ["Hello world", "This is a test"] - nlp = English() - matcher = Matcher(nlp.vocab) - phrasematcher = PhraseMatcher(nlp.vocab) - with pytest.deprecated_call(): - docs = list(nlp.pipe(texts, n_threads=4)) - with pytest.deprecated_call(): - docs = list(nlp.tokenizer.pipe(texts, n_threads=4)) - with pytest.deprecated_call(): - list(matcher.pipe(docs, n_threads=4)) - with pytest.deprecated_call(): - list(phrasematcher.pipe(docs, n_threads=4)) diff --git a/spacy/tests/regression/test_issue3447.py b/spacy/tests/regression/test_issue3447.py deleted file mode 100644 index 0ca1f9e67..000000000 --- a/spacy/tests/regression/test_issue3447.py +++ /dev/null @@ -1,14 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.util import decaying - - -def test_issue3447(): - sizes = decaying(10.0, 1.0, 0.5) - size = next(sizes) - assert size == 10.0 - size = next(sizes) - assert size == 10.0 - 0.5 - size = next(sizes) - assert size == 10.0 - 0.5 - 0.5 diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py deleted file mode 100644 index deff49fd6..000000000 --- a/spacy/tests/regression/test_issue3449.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest - -from spacy.lang.en import English - - -@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot") -def test_issue3449(): - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - text1 = "He gave the ball to I. Do you want to go to the movies with I?" - text2 = "He gave the ball to I. Do you want to go to the movies with I?" - text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" 
- t1 = nlp(text1) - t2 = nlp(text2) - t3 = nlp(text3) - assert t1[5].text == "I" - assert t2[5].text == "I" - assert t3[5].text == "I" diff --git a/spacy/tests/regression/test_issue3468.py b/spacy/tests/regression/test_issue3468.py deleted file mode 100644 index ebbed2640..000000000 --- a/spacy/tests/regression/test_issue3468.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English -from spacy.tokens import Doc - - -def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.is_sentenced can - be restored after serialization.""" - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - doc = nlp("Hello world") - assert doc[0].is_sent_start - assert doc.is_sentenced - assert len(list(doc.sents)) == 1 - doc_bytes = doc.to_bytes() - new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) - assert new_doc[0].is_sent_start - assert new_doc.is_sentenced - assert len(list(new_doc.sents)) == 1 From 4e04080b760fd3019d74259ae2172a836846317d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 13:00:52 +0200 Subject: [PATCH 119/148] Only compare sorted patterns in test Try to work around flaky tests on Python 3.5 --- spacy/tests/pipeline/test_entity_ruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index a371be38b..5ab1a3af0 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -111,7 +111,7 @@ def test_entity_ruler_serialize_bytes(nlp, patterns): assert len(new_ruler.patterns) == len(ruler.patterns) for pattern in ruler.patterns: assert pattern in new_ruler.patterns - assert new_ruler.labels == ruler.labels + assert sorted(new_ruler.labels) == sorted(ruler.labels) def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns): From 87f7ec34d503c3cde11570ce5b4ebb961dbb37fe Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 13:53:34 +0200 Subject: [PATCH 120/148] Add test for #3880 --- spacy/tests/regression/test_issue3880.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 spacy/tests/regression/test_issue3880.py diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py new file mode 100644 index 000000000..2171b5911 --- /dev/null +++ b/spacy/tests/regression/test_issue3880.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals +from spacy.lang.en import English +import pytest + + +@pytest.mark.xfail +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. 
+ """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe(nlp.create_pipe("parser")) + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("parser").add_label("dep") + nlp.get_pipe("ner").add_label("PERSON") + nlp.get_pipe("tagger").add_label("NN") + nlp.begin_training() + for doc in nlp.pipe(texts): + pass From 465456edb9871dd4bcc24e5a6236bfe272e5f137 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 14:01:17 +0200 Subject: [PATCH 121/148] Un-xfail test #3880 --- spacy/tests/regression/test_issue3880.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index 2171b5911..ecc12afa3 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -4,7 +4,6 @@ from spacy.lang.en import English import pytest -@pytest.mark.xfail def test_issue3880(): """Test that `nlp.pipe()` works when an empty string ends the batch. From 3d18600c052be8dca59e9193310f7fc6041011f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 19:21:23 +0200 Subject: [PATCH 122/148] Return True from doc.is_... when no ambiguity * Make doc.is_sentenced return True if len(doc) < 2. * Make doc.is_nered return True if len(doc) == 0, for consistency. Closes #3934 --- spacy/tokens/doc.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a040cdc67..c77e5c44e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -240,6 +240,8 @@ cdef class Doc: return True if self.is_parsed: return True + if len(self) < 2: + return True for i in range(1, self.length): if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: return True @@ -251,6 +253,8 @@ cdef class Doc: *any* of the tokens has a named entity tag set (even if the others are uknown values). """ + if len(self) == 0: + return True for i in range(self.length): if self.c[i].ent_iob != 0: return True From b94c5443d90c1fe60eb41d3a520bd8fa8d92f860 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 19:37:20 +0200 Subject: [PATCH 123/148] Rename Binder->DocBox, and improve it. --- spacy/tokens/_serialize.py | 73 +++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 43ea78242..c4478e080 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -11,29 +11,27 @@ from ..tokens import Doc from ..attrs import SPACY, ORTH -class Binder(object): +class DocBox(object): """Serialize analyses from a collection of doc objects.""" - def __init__(self, attrs=None): - """Create a Binder object, to hold serialized annotations. + def __init__(self, attrs=None, store_user_data=False): + """Create a DocBox object, to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are always serialized, so they're not required. Defaults to None. 
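+
+        A rough usage sketch (assuming a processed `doc` and its shared
+        `vocab` are already available; attribute IDs come from `spacy.attrs`):
+
+            from spacy.attrs import LEMMA, POS
+            box = DocBox(attrs=[LEMMA, POS], store_user_data=True)
+            box.add(doc)                  # collect an analysed Doc
+            data = box.to_bytes()         # serialize the whole collection
+            new_box = DocBox(store_user_data=True).from_bytes(data)
+            docs = list(new_box.get_docs(vocab))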
""" attrs = attrs or [] - self.attrs = list(attrs) # Ensure ORTH is always attrs[0] - if ORTH in self.attrs: - self.attrs.pop(ORTH) - if SPACY in self.attrs: - self.attrs.pop(SPACY) + self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) self.tokens = [] self.spaces = [] + self.user_data = [] self.strings = set() + self.store_user_data = store_user_data def add(self, doc): - """Add a doc's annotations to the binder for serialization.""" + """Add a doc's annotations to the DocBox for serialization.""" array = doc.to_array(self.attrs) if len(array.shape) == 1: array = array.reshape((array.shape[0], 1)) @@ -43,27 +41,35 @@ class Binder(object): spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.strings.update(w.text for w in doc) + if self.store_user_data: + self.user_data.append(srsly.msgpack_dumps(doc.user_data)) def get_docs(self, vocab): """Recover Doc objects from the annotations, using the given vocab.""" for string in self.strings: vocab[string] orth_col = self.attrs.index(ORTH) - for tokens, spaces in zip(self.tokens, self.spaces): + for i in range(len(self.tokens)): + tokens = self.tokens[i] + spaces = self.spaces[i] words = [vocab.strings[orth] for orth in tokens[:, orth_col]] doc = Doc(vocab, words=words, spaces=spaces) doc = doc.from_array(self.attrs, tokens) + if self.store_user_data: + doc.user_data.update(srsly.msgpack_loads(self.user_data[i])) yield doc def merge(self, other): - """Extend the annotations of this binder with the annotations from another.""" + """Extend the annotations of this DocBox with the annotations from another.""" assert self.attrs == other.attrs self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) self.strings.update(other.strings) + if self.store_user_data: + self.user_data.extend(other.user_data) def to_bytes(self): - """Serialize the binder's annotations into a byte string.""" + """Serialize the DocBox's annotations into a byte string.""" for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape lengths = [len(tokens) for tokens in self.tokens] @@ -74,10 +80,12 @@ class Binder(object): "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), } + if self.store_user_data: + msg["user_data"] = self.user_data return gzip.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, string): - """Deserialize the binder's annotations from a byte string.""" + """Deserialize the DocBox's annotations from a byte string.""" msg = srsly.msgpack_loads(gzip.decompress(string)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) @@ -89,29 +97,38 @@ class Binder(object): flat_spaces = flat_spaces.reshape((flat_spaces.size, 1)) self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths) + if self.store_user_data and "user_data" in msg: + self.user_data = list(msg["user_data"]) for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape return self -def merge_bytes(binder_strings): - """Concatenate multiple serialized binders into one byte string.""" - output = None - for byte_string in binder_strings: - binder = Binder().from_bytes(byte_string) - if output is None: - output = binder - else: - output.merge(binder) - return output.to_bytes() +def merge_boxes(boxes): + merged = None + for byte_string in boxes: + if byte_string is not None: + box = DocBox(store_user_data=True).from_bytes(byte_string) + if merged is None: + merged = box + else: + 
merged.merge(box) + if merged is not None: + return merged.to_bytes() + else: + return b'' -def pickle_binder(binder): - return (unpickle_binder, (binder.to_bytes(),)) +def pickle_box(box): + return (unpickle_box, (box.to_bytes(),)) -def unpickle_binder(byte_string): - return Binder().from_bytes(byte_string) +def unpickle_box(byte_string): + return Box().from_bytes(byte_string) -copy_reg.pickle(Binder, pickle_binder, unpickle_binder) +copy_reg.pickle(Box, pickle_box, unpickle_box) +# Compatibility, as we had named it this previously. +Binder = DocBox + +__all__ = ["DocBox"] From c4c21cb4281133890d0b59c4b5a847d1ef9bff30 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 10 Jul 2019 19:39:38 +0200 Subject: [PATCH 124/148] more friendly textcat errors (#3946) * more friendly textcat errors with require_model and require_labels * update thinc version with recent bugfix --- requirements.txt | 2 +- spacy/errors.py | 1 + spacy/pipeline/pipes.pyx | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8cc52dfe4..58761b95c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.2,<7.1.0 +thinc>=7.0.5,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/spacy/errors.py b/spacy/errors.py index 8f2eab3a1..347ad1fca 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -403,6 +403,7 @@ class Errors(object): E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the provided {found}.") E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'") + E143 = ("Labels for component '{name}' not initialized. 
Did you forget to call add_label()?") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d99a1f73e..891e8d4e3 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -902,6 +902,11 @@ class TextCategorizer(Pipe): def labels(self): return tuple(self.cfg.setdefault("labels", [])) + def require_labels(self): + """Raise an error if the component's model has no labels defined.""" + if not self.labels: + raise ValueError(Errors.E143.format(name=self.name)) + @labels.setter def labels(self, value): self.cfg["labels"] = tuple(value) @@ -931,6 +936,7 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): + self.require_model() scores, bp_scores = self.model.begin_update(docs, drop=drop) loss, d_scores = self.get_loss(docs, golds, scores) bp_scores(d_scores, sgd=sgd) @@ -985,6 +991,7 @@ class TextCategorizer(Pipe): def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): if self.model is True: self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") + self.require_labels() self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) if sgd is None: From c6cb78275888228cc647a950d9adfbf545a60ad6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 22:54:09 +0200 Subject: [PATCH 125/148] Set version to 2.1.5.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 5e7093606..758809934 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.4" +__version__ = "2.1.5.dev0" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From a89fecce97c06d7315bb955de1127025fa310b4b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 00:43:55 +0200 Subject: [PATCH 126/148] failing unit test for issue #3869 --- spacy/tests/regression/test_issue3869.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 spacy/tests/regression/test_issue3869.py diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py new file mode 100644 index 000000000..72a485042 --- /dev/null +++ b/spacy/tests/regression/test_issue3869.py @@ -0,0 +1,29 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from spacy.attrs import IS_ALPHA +from spacy.lang.en import English + + +@pytest.mark.parametrize( + "sentence", + [ + 'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.', + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.' 
+ ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + + doc = nlp(sentence) + + count = 0 + for token in doc: + count += token.is_alpha + + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + From e0804123854b91bbad5a3e084de867d5fbbff788 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 01:53:06 +0200 Subject: [PATCH 127/148] tracked the bug down to PreshCounter.inc - still unclear what goes wrong --- spacy/tests/regression/test_issue3869.py | 6 ++++- spacy/tokens/doc.pxd | 1 + spacy/tokens/doc.pyx | 33 +++++++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index 72a485042..d76da6989 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -11,13 +11,17 @@ from spacy.lang.en import English "sentence", [ 'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.', - 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.' + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.', + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one', + 'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.', + "It was a missed assignment, but it shouldn't have resulted in a turnover ..." ], ) def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() + print() doc = nlp(sentence) count = 0 diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7cdc2316a..cc05cb495 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -8,6 +8,7 @@ from ..typedefs cimport attr_t from ..attrs cimport attr_id_t + cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c77e5c44e..657b9a1d6 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -13,6 +13,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg import struct +from libc.stdint cimport int64_t import srsly from thinc.neural.util import get_array_module, copy_array @@ -710,22 +711,52 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count + cdef int64_t this_value + + print("COUNTING") if counts is None: counts = PreshCounter() output_dict = True + print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: + print("exclude None") for i in range(self.length): - counts.inc(get_token_attr(&self.c[i], attr_id), 1) + print() + print("token", self[i]) + this_value = get_token_attr(&self.c[i], attr_id) + print("token attr value", this_value) + print("type attr value", type(this_value)) + + print(i, "key this_value before", counts.c_map.cells[this_value].key) + print(i, "value this_value before", counts.c_map.cells[this_value].value) + counts.inc(this_value, 1) + print(i, "key this_value after", counts.c_map.cells[this_value].key) + print(i, "value this_value after", counts.c_map.cells[this_value].value) + + print(i, "key 0", counts.c_map.cells[0].key) + print(i, 
"value 0", counts.c_map.cells[0].value) + print(i, "key 1", counts.c_map.cells[1].key) + print(i, "value 1", counts.c_map.cells[1].value) else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) counts.inc(attr, 1) if output_dict: + print("output_dict") + print(counts.length) + print(counts.total) + print("key 0", counts.c_map.cells[0].key) + print("value 0", counts.c_map.cells[0].value) + print("key 1", counts.c_map.cells[1].key) + print("value 1", counts.c_map.cells[1].value) + print() + print(dict(counts)) + print() return dict(counts) def _realloc(self, new_size): From d5311b3c42554d5288f3fcb9261ae301a21fd9e1 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 14:53:14 +0530 Subject: [PATCH 128/148] Add test file for issue (#3625) and spacy contributor agreement --- .github/contributors/yashpatadia.md | 106 +++++++++++++++++++++++ spacy/tests/regression/test_issue3625.py | 9 ++ 2 files changed, 115 insertions(+) create mode 100644 .github/contributors/yashpatadia.md create mode 100644 spacy/tests/regression/test_issue3625.py diff --git a/.github/contributors/yashpatadia.md b/.github/contributors/yashpatadia.md new file mode 100644 index 000000000..2dcf9211d --- /dev/null +++ b/.github/contributors/yashpatadia.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Yash Patadia | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 11/07/2019 | +| GitHub username | yash1994 | +| Website (optional) | | \ No newline at end of file diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py new file mode 100644 index 000000000..f61b834fb --- /dev/null +++ b/spacy/tests/regression/test_issue3625.py @@ -0,0 +1,9 @@ +from __future__ import unicode_literals + +from spacy.lang.hi import Hindi + +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp(u"hi. how हुए. होटल, होटल") + assert [token.text for token in doc] == ['hi', '.', 'how', 'हुए', '.', 'होटल', ',', 'होटल'] \ No newline at end of file From 815f8d13dd0cfe034201b2c35452012a7adb1e03 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 15:00:51 +0530 Subject: [PATCH 129/148] Fix default punctuation rules for hindi text (#3625 explosion) --- spacy/lang/char_classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index cb2e817d5..fb320b2ff 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -9,6 +9,8 @@ _bengali = r"\u0980-\u09FF" _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" +_hindi = r"\u0900-\u097F" + # Latin standard _latin_u_standard = r"A-Z" _latin_l_standard = r"a-z" @@ -193,7 +195,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ" _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower -_uncased = _bengali + _hebrew + _persian + _sinhala +_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) From bd3c3f342b01cd0b48e1a02bc11bc37c9d9e63a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 11:48:55 +0200 Subject: [PATCH 130/148] Fix _serialize --- spacy/tokens/_serialize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index c4478e080..57bc98f4b 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -124,10 +124,10 @@ def pickle_box(box): def unpickle_box(byte_string): - return Box().from_bytes(byte_string) + return DocBox().from_bytes(byte_string) -copy_reg.pickle(Box, pickle_box, unpickle_box) +copy_reg.pickle(DocBox, pickle_box, unpickle_box) # Compatibility, as we had named it this previously. 
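 # The old name stays importable, so existing code that refers to Binder keeps working.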
Binder = DocBox From 0491a8e7c83dcbf8a293305681b498d38514541f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 11:49:36 +0200 Subject: [PATCH 131/148] Reformat --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 57bc98f4b..41f524839 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -116,7 +116,7 @@ def merge_boxes(boxes): if merged is not None: return merged.to_bytes() else: - return b'' + return b"" def pickle_box(box): From ae2d52e323ea8959caf474d23de857d59b5b6ca8 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 15:26:27 +0530 Subject: [PATCH 132/148] Add default encoding utf-8 for test file --- spacy/tests/regression/test_issue3625.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py index f61b834fb..e3e0f25ee 100644 --- a/spacy/tests/regression/test_issue3625.py +++ b/spacy/tests/regression/test_issue3625.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals from spacy.lang.hi import Hindi From 0b8406a05cf497ce40071efb56894fee7f20b4d2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Jul 2019 12:02:25 +0200 Subject: [PATCH 133/148] Tidy up and auto-format --- spacy/_ml.py | 31 ++++++++---- spacy/lang/ko/__init__.py | 4 +- spacy/lang/ko/tag_map.py | 59 ++++++++++------------- spacy/tests/lang/ko/test_lemmatization.py | 3 +- spacy/tests/lang/ko/test_tokenizer.py | 6 +-- spacy/tests/lang/lt/test_text.py | 48 +++++++++++------- spacy/tests/matcher/test_matcher_api.py | 44 ++++++++++++----- spacy/tests/regression/test_issue3880.py | 2 +- 8 files changed, 118 insertions(+), 79 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index cca324b45..d16e124dc 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -661,21 +661,33 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name + pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name context_width = cfg.get("context_width") entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(entity_width, entity_width+context_width+1+ner_types)\ - >> Affine(1, entity_width, drop_factor=0.0)\ - >> logistic + model = ( + Affine(entity_width, entity_width + context_width + 1 + ner_types) + >> Affine(1, entity_width, drop_factor=0.0) + >> logistic + ) # context encoder - tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth, - bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ - >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(context_width, hidden_width)) + tok2vec = ( + Tok2Vec( + width=hidden_width, + embed_size=embed_width, + pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, + subword_features=True, + conv_depth=conv_depth, + bilstm_depth=0, + ) + >> flatten_add_lengths + >> Pooling(mean_pool) + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) + >> zero_init(Affine(context_width, hidden_width)) + ) model.tok2vec = tok2vec @@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): 
model.nO = 1 return model + @layerize def flatten(seqs, drop=0.0): ops = Model.ops diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 111d01720..f5dff75f1 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -6,7 +6,7 @@ import sys from .stop_words import STOP_WORDS -from .tag_map import TAG_MAP, POS +from .tag_map import TAG_MAP from ...attrs import LANG from ...language import Language from ...tokens import Doc @@ -22,6 +22,7 @@ if is_python_pre_3_5: Morpheme = namedtuple("Morpheme", "surface lemma tag") elif is_python_post_3_7: from dataclasses import dataclass + @dataclass(frozen=True) class Morpheme: surface: str @@ -29,6 +30,7 @@ elif is_python_post_3_7: tag: str else: from typing import NamedTuple + class Morpheme(NamedTuple): surface: str lemma: str diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py index ed6b58170..57317c969 100644 --- a/spacy/lang/ko/tag_map.py +++ b/spacy/lang/ko/tag_map.py @@ -1,66 +1,59 @@ # encoding: utf8 from __future__ import unicode_literals -from collections import defaultdict -from ...symbols import (POS, PUNCT, INTJ, X, SYM, - ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, - NUM, DET) - +from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON +from ...symbols import VERB, ADV, PROPN, NUM, DET + # 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 # https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 # https://universaldependencies.org/u/pos/ TAG_MAP = { - # J.{1,2} 조사 - "JKS": {POS: ADP}, + # J.{1,2} 조사 + "JKS": {POS: ADP}, "JKC": {POS: ADP}, "JKG": {POS: ADP}, "JKO": {POS: ADP}, "JKB": {POS: ADP}, "JKV": {POS: ADP}, "JKQ": {POS: ADP}, - "JX": {POS: ADP}, # 보조사 + "JX": {POS: ADP}, # 보조사 "JC": {POS: CONJ}, # 접속 조사 - "MAJ": {POS: CONJ}, # 접속 부사 + "MAJ": {POS: CONJ}, # 접속 부사 "MAG": {POS: ADV}, # 일반 부사 - "MM": {POS: DET}, # 관형사 - + "MM": {POS: DET}, # 관형사 "XPN": {POS: X}, # 접두사 - # XS. 접미사 + # XS. 접미사 "XSN": {POS: X}, "XSV": {POS: X}, "XSA": {POS: X}, - "XR": {POS: X}, # 어근 + "XR": {POS: X}, # 어근 # E.{1,2} 어미 "EP": {POS: X}, "EF": {POS: X}, "EC": {POS: X}, "ETN": {POS: X}, "ETM": {POS: X}, - "IC": {POS: INTJ}, # 감탄사 - "VV": {POS: VERB}, # 동사 - "VA": {POS: ADJ}, # 형용사 - "VX": {POS: AUX}, # 보조 용언 + "VA": {POS: ADJ}, # 형용사 + "VX": {POS: AUX}, # 보조 용언 "VCP": {POS: ADP}, # 긍정 지정사(이다) "VCN": {POS: ADJ}, # 부정 지정사(아니다) - - "NNG": {POS: NOUN}, # 일반 명사(general noun) - "NNB": {POS: NOUN}, # 의존 명사 - "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) - "NNP": {POS: PROPN}, # 고유 명사(proper noun) + "NNG": {POS: NOUN}, # 일반 명사(general noun) + "NNB": {POS: NOUN}, # 의존 명사 + "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) + "NNP": {POS: PROPN}, # 고유 명사(proper noun) "NP": {POS: PRON}, # 대명사 - "NR": {POS: NUM}, # 수사(numerals) - "SN": {POS: NUM}, # 숫자 - + "NR": {POS: NUM}, # 수사(numerals) + "SN": {POS: NUM}, # 숫자 # S.{1,2} 부호 - # 문장 부호 - "SF": {POS: PUNCT}, # period or other EOS marker + # 문장 부호 + "SF": {POS: PUNCT}, # period or other EOS marker "SE": {POS: PUNCT}, - "SC": {POS: PUNCT}, # comma, etc. - "SSO": {POS: PUNCT}, # open bracket - "SSC": {POS: PUNCT}, # close bracket - "SY": {POS: SYM}, # 기타 기호 - "SL": {POS: X}, # 외국어 - "SH": {POS: X}, # 한자 + "SC": {POS: PUNCT}, # comma, etc. 
+ "SSO": {POS: PUNCT}, # open bracket + "SSC": {POS: PUNCT}, # close bracket + "SY": {POS: SYM}, # 기타 기호 + "SL": {POS: X}, # 외국어 + "SH": {POS: X}, # 한자 } diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 67371d4ce..42c306c11 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -5,8 +5,7 @@ import pytest @pytest.mark.parametrize( - "word,lemma", - [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")], + "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")] ) def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): test_lemma = ko_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index bd1d94aec..cc7b5fd77 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -7,15 +7,15 @@ import pytest TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] -TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", +TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", "NNP NNG NNG JKB VV EC VX EF SF"), - ("영등포구에 있는 맛집 좀 알려주세요.", + ("영등포구에 있는 맛집 좀 알려주세요.", "NNP JKB VV ETM NNG MAG VV VX EP SF")] FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] -POS_TESTS = [("서울 타워 근처에 살고 있습니다.", +POS_TESTS = [("서울 타워 근처에 살고 있습니다.", "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), ("영등포구에 있는 맛집 좀 알려주세요.", "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index d2550067b..7afc6d497 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -5,16 +5,26 @@ import pytest def test_lt_tokenizer_handles_long_text(lt_tokenizer): - text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią -vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis + text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią +vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" tokens = lt_tokenizer(text.replace("\n", "")) assert len(tokens) == 42 -@pytest.mark.parametrize('text,length', [ - ("177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15), - ("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)]) +@pytest.mark.parametrize( + "text,length", + [ + ( + "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", + 15, + ), + ( + "ISM universiteto doc. dr. 
Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", + 16, + ), + ], +) def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length): tokens = lt_tokenizer(text) assert len(tokens) == length @@ -26,18 +36,22 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text): assert len(tokens) == 1 -@pytest.mark.parametrize("text,match", [ - ("10", True), - ("1", True), - ("10,000", True), - ("10,00", True), - ("999.0", True), - ("vienas", True), - ("du", True), - ("milijardas", True), - ("šuo", False), - (",", False), - ("1/2", True)]) +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("vienas", True), + ("du", True), + ("milijardas", True), + ("šuo", False), + (",", False), + ("1/2", True), + ], +) def test_lt_lex_attrs_like_number(lt_tokenizer, text, match): tokens = lt_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 54ddd6789..013700d52 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,6 @@ import pytest import re from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token -from ..util import get_doc @pytest.fixture @@ -288,24 +287,43 @@ def deps(): def dependency_matcher(en_vocab): def is_brown_yellow(text): return bool(re.compile(r"brown|yellow|over").match(text)) + IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow) pattern1 = [ {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}}, - {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}}, + { + "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "quick", "DEP": "amod"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {IS_BROWN_YELLOW: True}, + }, ] pattern2 = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}} + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, ] pattern3 = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}} + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "brown"}, + }, ] matcher = DependencyMatcher(en_vocab) @@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher): assert len(dependency_matcher) == 3 -def test_dependency_matcher(dependency_matcher, text, heads, deps): - doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) - matches = dependency_matcher(doc) - # assert matches[0][1] == [[3, 1, 2]] - # assert matches[1][1] == [[4, 3, 3]] - # assert matches[2][1] 
== [[4, 3, 2]] +# def test_dependency_matcher(dependency_matcher, text, heads, deps): +# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) +# matches = dependency_matcher(doc) +# assert matches[0][1] == [[3, 1, 2]] +# assert matches[1][1] == [[4, 3, 3]] +# assert matches[2][1] == [[4, 3, 2]] diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index ecc12afa3..6de373f11 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals + from spacy.lang.en import English -import pytest def test_issue3880(): From d16675660775853f6530495f464dc715d052e2a7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Jul 2019 12:16:43 +0200 Subject: [PATCH 134/148] Fix test --- spacy/tests/lang/lt/test_text.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index 7afc6d497..cac32aa4d 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -5,10 +5,8 @@ import pytest def test_lt_tokenizer_handles_long_text(lt_tokenizer): - text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią -vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis -yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" - tokens = lt_tokenizer(text.replace("\n", "")) + text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" + tokens = lt_tokenizer(text) assert len(tokens) == 42 From e19f4ee719af1a4ce8391c6934ff3edf4cdb7ca3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 12:32:59 +0200 Subject: [PATCH 135/148] Add warning message re Issue #3853 --- spacy/_ml.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d16e124dc..abb44e1b7 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -299,7 +299,14 @@ def link_vectors_to_models(vocab): data = ops.asarray(vectors.data) # Set an entry here, so that vectors are accessed by StaticVectors # (unideal, I know) - thinc.extra.load_nlp.VECTORS[(ops.device, vectors.name)] = data + key = (ops.device, vectors.name) + if key in thinc.extra.load_nlp.VECTORS: + if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: + print( + "Warning: Registering vectors data under the same ID as " + "existing vectors, and the new vectors data seems different. " + "This might lead to incorrect results. 
See Issue #3853") + thinc.extra.load_nlp.VECTORS[key] = data def PyTorchBiLSTM(nO, nI, depth, dropout=0.2): From b40b4c2c31c7e43f7cee1f491e57d444bf1fd6d1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 12:55:11 +0200 Subject: [PATCH 136/148] =?UTF-8?q?=F0=9F=92=AB=20Fix=20issue=20#3839:=20I?= =?UTF-8?q?ncorrect=20entity=20IDs=20from=20Matcher=20with=20operators=20(?= =?UTF-8?q?#3949)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add regression test for issue #3541 * Add comment on bugfix * Remove incorrect test * Un-xfail test --- spacy/matcher/matcher.pyx | 8 ++++---- spacy/tests/regression/test_issue3839.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2dd8c2940..86658ce99 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -262,13 +262,13 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: + # There have been a few bugs here. # The code was originally designed to always have pattern[1].attrs.value # be the ent_id when we get to the end of a pattern. However, Issue #2671 # showed this wasn't the case when we had a reject-and-continue before a - # match. I still don't really understand what's going on here, but this - # workaround does resolve the issue. - while pattern.attrs.attr != ID and \ - (pattern.nr_attr > 0 or pattern.nr_extra_attr > 0 or pattern.nr_py > 0): + # match. + # The patch to #2671 was wrong though, which came up in #3839. + while pattern.attrs.attr != ID: pattern += 1 return pattern.attrs.value diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py index fa915faf0..34d6bb46e 100644 --- a/spacy/tests/regression/test_issue3839.py +++ b/spacy/tests/regression/test_issue3839.py @@ -6,7 +6,6 @@ from spacy.matcher import Matcher from spacy.tokens import Doc -@pytest.mark.xfail def test_issue3839(en_vocab): """Test that match IDs returned by the matcher are correct, are in the string """ doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) From 0f0f07318a9bbf37ca3f4e008c35a7c88ded777f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 13:05:53 +0200 Subject: [PATCH 137/148] counter instead of preshcounter --- bin/train_word_vectors.py | 1 - spacy/tokens/doc.pxd | 1 - spacy/tokens/doc.pyx | 37 +++++-------------------------------- 3 files changed, 5 insertions(+), 34 deletions(-) diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py index 624e339a0..663ce060d 100644 --- a/bin/train_word_vectors.py +++ b/bin/train_word_vectors.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from collections import defaultdict from gensim.models import Word2Vec -from preshed.counter import PreshCounter import plac import spacy diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index cc05cb495..4b8578fe0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -1,6 +1,5 @@ from cymem.cymem cimport Pool cimport numpy as np -from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 657b9a1d6..3b0c2425c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -9,6 +9,7 @@ cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt +from collections import Counter import numpy 
import numpy.linalg @@ -698,7 +699,7 @@ cdef class Doc: # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, object counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -713,50 +714,22 @@ cdef class Doc: cdef size_t count cdef int64_t this_value - print("COUNTING") - if counts is None: - counts = PreshCounter() + counts = Counter() output_dict = True - print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: - print("exclude None") for i in range(self.length): - print() - print("token", self[i]) this_value = get_token_attr(&self.c[i], attr_id) - print("token attr value", this_value) - print("type attr value", type(this_value)) - - print(i, "key this_value before", counts.c_map.cells[this_value].key) - print(i, "value this_value before", counts.c_map.cells[this_value].value) - counts.inc(this_value, 1) - print(i, "key this_value after", counts.c_map.cells[this_value].key) - print(i, "value this_value after", counts.c_map.cells[this_value].value) - - print(i, "key 0", counts.c_map.cells[0].key) - print(i, "value 0", counts.c_map.cells[0].value) - print(i, "key 1", counts.c_map.cells[1].key) - print(i, "value 1", counts.c_map.cells[1].value) + counts[this_value] += 1 else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) - counts.inc(attr, 1) + counts[attr] += 1 if output_dict: - print("output_dict") - print(counts.length) - print(counts.total) - print("key 0", counts.c_map.cells[0].key) - print("value 0", counts.c_map.cells[0].value) - print("key 1", counts.c_map.cells[1].key) - print("value 1", counts.c_map.cells[1].value) - print() - print(dict(counts)) - print() return dict(counts) def _realloc(self, new_size): From 349107daa3b0804c62861dbaa810e9a1488960b1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 13:09:22 +0200 Subject: [PATCH 138/148] cleanup --- spacy/tests/regression/test_issue3869.py | 2 -- spacy/tokens/doc.pxd | 1 - spacy/tokens/doc.pyx | 8 ++------ 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index d76da6989..42584b133 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -20,8 +20,6 @@ from spacy.lang.en import English def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() - - print() doc = nlp(sentence) count = 0 diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 4b8578fe0..62665fcc5 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -7,7 +7,6 @@ from ..typedefs cimport attr_t from ..attrs cimport attr_id_t - cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b0c2425c..c1883f9c0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,7 +14,6 @@ from collections import Counter import numpy import numpy.linalg import struct -from libc.stdint cimport int64_t import srsly from thinc.neural.util import get_array_module, copy_array @@ -712,7 +711,6 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count - cdef int64_t 
this_value if counts is None: counts = Counter() @@ -722,13 +720,11 @@ cdef class Doc: # Take this check out of the loop, for a bit of extra speed if exclude is None: for i in range(self.length): - this_value = get_token_attr(&self.c[i], attr_id) - counts[this_value] += 1 + counts[get_token_attr(&self.c[i], attr_id)] += 1 else: for i in range(self.length): if not exclude(self[i]): - attr = get_token_attr(&self.c[i], attr_id) - counts[attr] += 1 + counts[get_token_attr(&self.c[i], attr_id)] += 1 if output_dict: return dict(counts) From 7369949d2e90872c56ec60ba9229ddf4bac92590 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 14:44:32 +0200 Subject: [PATCH 139/148] Add warning for #3853 --- spacy/errors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 347ad1fca..ed3d6afb9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -82,6 +82,8 @@ class Warnings(object): "parallel inference via multiprocessing.") W017 = ("Alias '{alias}' already exists in the Knowledge base.") W018 = ("Entity '{entity}' already exists in the Knowledge base.") + W019 = ("Changing vectors name from {old} to {new}, to avoid clash with " + "previously loaded vectors. See Issue #3853.") @add_codes From 09dc01a4266b0f6b3a557918a4af44eac685f1bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 14:46:29 +0200 Subject: [PATCH 140/148] Fix #3853, and add warning --- spacy/_ml.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index abb44e1b7..4d9bb4c2b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -24,7 +24,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE -from .errors import Errors +from .errors import Errors, user_warning, Warnings from . import util try: @@ -302,10 +302,13 @@ def link_vectors_to_models(vocab): key = (ops.device, vectors.name) if key in thinc.extra.load_nlp.VECTORS: if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - print( - "Warning: Registering vectors data under the same ID as " - "existing vectors, and the new vectors data seems different. " - "This might lead to incorrect results. See Issue #3853") + # This is a hack to avoid the problem in #3853. Maybe we should + # print a warning as well? 
+ old_name = vectors.name + new_name = vectors.name + "_%d" % data.shape[0] + user_warning(Warnings.W019.format(old=old_name, new=new_name)) + vectors.name = new_name + key = (ops.device, vectors.name) thinc.extra.load_nlp.VECTORS[key] = data From cda9fc3dae6ae430298cc293e378eac5813bf685 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Jul 2019 15:53:13 +0200 Subject: [PATCH 141/148] Update Thinc version pin --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 80bb5905a..35ff96903 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,6 @@ requires = ["setuptools", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.0.0.dev6", + "thinc>=7.0.6,<7.1.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 58761b95c..99935f335 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.5,<7.1.0 +thinc>=7.0.6,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/setup.py b/setup.py index 544188f4a..4d6416cad 100755 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", - "thinc>=7.0.2,<7.1.0", + "thinc>=7.0.6,<7.1.0", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", From 123929b58bcdb6738f0d0ca0254783f6ab8e61fb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 12 Jul 2019 00:15:35 +0200 Subject: [PATCH 142/148] Update Thinc version pin --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 35ff96903..35f3d9215 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,6 @@ requires = ["setuptools", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=7.0.6,<7.1.0", + "thinc>=7.0.8,<7.1.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 99935f335..5a6870cd3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.6,<7.1.0 +thinc>=7.0.8,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/setup.py b/setup.py index 4d6416cad..b36c48316 100755 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", - "thinc>=7.0.6,<7.1.0", + "thinc>=7.0.8,<7.1.0", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", From ed774cb9530d701cdfbad72fa84d86d8ac965706 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 12 Jul 2019 10:01:35 +0200 Subject: [PATCH 143/148] Fixing ngram bug (#3953) * minimal failing example for Issue #3661 * referenced Issue #3661 instead of Issue #3611 * cleanup --- spacy/tests/regression/test_issue3611.py | 51 ++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 spacy/tests/regression/test_issue3611.py diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py new file mode 100644 index 000000000..29aa5370d --- /dev/null +++ b/spacy/tests/regression/test_issue3611.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +import spacy +from 
spacy.util import minibatch, compounding + + +def test_issue3611(): + """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + unique_classes = ["offensive", "inoffensive"] + x_train = ["This is an offensive text", + "This is the second offensive text", + "inoff"] + y_train = ["offensive", "offensive", "inoffensive"] + + # preparing the data + pos_cats = list() + for train_instance in y_train: + pos_cats.append({label: label == train_instance for label in unique_classes}) + train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats])) + + # set up the spacy model with a text categorizer component + nlp = spacy.blank('en') + + textcat = nlp.create_pipe( + "textcat", + config={ + "exclusive_classes": True, + "architecture": "bow", + "ngram_size": 2 + } + ) + + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + + # training the network + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + with nlp.disable_pipes(*other_pipes): + optimizer = nlp.begin_training() + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + texts, annotations = zip(*batch) + nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses) + + From 3bc4d618f920998e76cc5302a1ce79d285cdc5c3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jul 2019 13:26:12 +0200 Subject: [PATCH 144/148] Set version to v2.1.5 --- spacy/about.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 758809934..8fb7d23bc 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,13 +4,13 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.5.dev0" +__version__ = "2.1.5" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" __email__ = "contact@explosion.ai" __license__ = "MIT" -__release__ = False +__release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 02e12b08527242ae976a62c17ba425a962d36e77 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 12 Jul 2019 13:36:47 +0200 Subject: [PATCH 145/148] Update landing with IRL videos [ci skip] --- website/src/widgets/landing.js | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index f55aa5aa3..e9dec87f4 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -152,20 +152,21 @@ const Landing = ({ data }) => { - We're pleased to invite the spaCy community and other folks working on Natural + We were pleased to invite the spaCy community and other folks working on Natural Language Processing to Berlin this summer for a small and intimate event{' '} - July 5-6, 2019. The event includes a hands-on training day for - teams using spaCy in production, followed by a one-track conference. We've - booked a beautiful venue, hand-picked an awesome lineup of speakers and - scheduled plenty of social time to get to know each other and exchange ideas. + July 6, 2019. We booked a beautiful venue, hand-picked an + awesome lineup of speakers and scheduled plenty of social time to get to know + each other and exchange ideas. 
The YouTube playlist includes 12 talks about NLP + research, development and applications, with keynotes by Sebastian Ruder + (DeepMind) and Yoav Goldberg (Allen AI). Date: Fri, 12 Jul 2019 17:40:00 +0200 Subject: [PATCH 146/148] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8fb7d23bc..16e5e9522 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.5" +__version__ = "2.1.6" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From c345c042b0b2c2b8b1607f8f2a8f8ebbe745aa88 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jul 2019 17:48:16 +0200 Subject: [PATCH 147/148] Fix symbol alignment --- spacy/symbols.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 4501861a2..5922ee588 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -81,7 +81,6 @@ cdef enum symbol_t: DEP ENT_IOB ENT_TYPE - ENT_KB_ID HEAD SENT_START SPACY @@ -461,3 +460,5 @@ cdef enum symbol_t: xcomp acl + + ENT_KB_ID From ef666656b397b6ec6c5f2693c22afd5a65dea9d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jul 2019 17:59:47 +0200 Subject: [PATCH 148/148] Fix attrs alignment --- spacy/attrs.pxd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index c5ba8d765..d9aca078c 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,4 +1,6 @@ # Reserve 64 values for flag features +from . cimport symbols + cdef enum attr_id_t: NULL_ATTR IS_ALPHA @@ -82,10 +84,10 @@ cdef enum attr_id_t: DEP ENT_IOB ENT_TYPE - ENT_KB_ID HEAD SENT_START SPACY PROB LANG + ENT_KB_ID = symbols.ENT_KB_ID
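
Editor's note on the last two patches ("Fix symbol alignment", "Fix attrs alignment"): they move ENT_KB_ID to the very end of the symbol_t and attr_id_t enums and pin the attribute with `ENT_KB_ID = symbols.ENT_KB_ID`. The commit messages do not say why, but a plausible reading is that inserting a new member in the middle of a C enum shifts the integer value of every member that follows it, which would silently change attribute and symbol IDs that other code (or previously serialized data) already relies on; appending the new member avoids the shift. A minimal, hypothetical Python sketch of that effect — these are illustrative enums, not spaCy's real declarations:

from enum import IntEnum

# Hypothetical "before" layout: ENT_KB_ID inserted mid-enum pushes HEAD (and
# every later member) to a new integer value.
class AttrsBefore(IntEnum):
    ENT_TYPE = 0
    ENT_KB_ID = 1
    HEAD = 2

# Hypothetical "after" layout, mirroring what patches 147/148 do: the new
# member is appended, so every pre-existing attribute keeps its old value.
class AttrsAfter(IntEnum):
    ENT_TYPE = 0
    HEAD = 1
    ENT_KB_ID = 2

assert AttrsAfter.ENT_TYPE == AttrsBefore.ENT_TYPE  # untouched member keeps its ID
assert AttrsBefore.HEAD.value == 2 and AttrsAfter.HEAD.value == 1  # the shift the fix avoids

Pinning the attribute to `symbols.ENT_KB_ID` in attrs.pxd then keeps the two enums in sync without renumbering anything in between.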