From d0c763ba447282d53ac7d25354afde468f0e4a73 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 17:33:25 +0100
Subject: [PATCH] minimal EL pipe

---
 spacy/kb.pxd                           |  14 ++--
 spacy/kb.pyx                           |   3 +-
 spacy/language.py                      |   4 +
 spacy/pipeline/pipes.pyx               | 100 ++++---------------------
 spacy/sandbox_test_sofie/testing_el.py |  17 +++--
 5 files changed, 37 insertions(+), 101 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 4ae34bfa7..5fd239998 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -109,7 +109,7 @@ cdef class KnowledgeBase:
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t entity_index = self._entries.size()
+        cdef int64_t new_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 entity_hash=entity_hash,
@@ -117,22 +117,22 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_hash] = entity_index
-        return entity_index
+        self._entry_index[entity_hash] = new_index
+        return new_index
 
     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
-        cdef int64_t alias_index = self._aliases_table.size()
+        cdef int64_t new_index = self._aliases_table.size()
 
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
                 probs=probs
             ))
-        self._alias_index[alias_hash] = alias_index
-        return alias_index
+        self._alias_index[alias_hash] = new_index
+        return new_index
 
-    cdef inline create_empty_vectors(self):
+    cdef inline _create_empty_vectors(self):
         """ 
         Making sure the first element of each vector is a dummy,
         because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 62080e1be..33a79da04 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -65,7 +65,7 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
         self.strings = StringStore()
-        self.create_empty_vectors()
+        self._create_empty_vectors()
 
     def __len__(self):
         return self.get_size_entities()
@@ -151,4 +151,3 @@ cdef class KnowledgeBase:
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
-
diff --git a/spacy/language.py b/spacy/language.py
index 736899341..f80d8699d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -209,6 +209,10 @@ class Language(object):
     def entity(self):
         return self.get_pipe("ner")
 
+    @property
+    def linker(self):
+        return self.get_pipe("el")
+
     @property
     def matcher(self):
         return self.get_pipe("matcher")
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index e1e5471be..5866518a7 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1045,44 +1045,28 @@ class EntityLinker(Pipe):
 
     @classmethod
     def Model(cls, nr_class=1, **cfg):
-        embed_size = util.env_opt("embed_size", 2000)
-        if "token_vector_width" in cfg:
-            token_vector_width = cfg["token_vector_width"]
-        else:
-            token_vector_width = util.env_opt("token_vector_width", 96)
-        if cfg.get('architecture') == 'simple_cnn':
-            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
-            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
-        else:
-            return None # build_text_classifier(nr_class, **cfg)
+        # TODO: non-dummy EL implementation
+        return None
 
-
-    def __init__(self, vocab, model=True, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self._rehearsal_model = None
+    def __init__(self, model=True, **cfg):
+        self.model = False
         self.cfg = dict(cfg)
+        self.kb = self.cfg["kb"]
 
     def __call__(self, doc):
-        # scores, tensors = self.predict([doc])
-        scores, tensors = None, None
-        self.set_annotations([doc], scores, tensors=tensors)
+        self.set_annotations([doc], scores=None, tensors=None)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+        Both __call__ and pipe currently call `set_annotations()` directly;
+        there is no `predict()` step while the model is a placeholder.
+        """
         for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
-            scores, tensors = self.predict(docs)
-            self.set_annotations(docs, scores, tensors=tensors)
+            self.set_annotations(docs, scores=None, tensors=None)
             yield from docs
 
-    def predict(self, docs):
-        # self.require_model()
-        scores = self.model(docs)
-        scores = self.model.ops.asarray(scores)
-        tensors = [doc.tensor for doc in docs]
-        return scores, tensors
-
     def set_annotations(self, docs, scores, tensors=None):
         # TODO Sofie: actually implement this class instead of dummy implementation
         for i, doc in enumerate(docs):
@@ -1091,67 +1075,13 @@ class EntityLinker(Pipe):
                     for token in ent:
                         token.ent_kb_id_ = "Q42"
 
-    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
-        scores, bp_scores = self.model.begin_update(docs, drop=drop)
-        loss, d_scores = self.get_loss(docs, golds, scores)
-        bp_scores(d_scores, sgd=sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += loss
-
-    def rehearse(self, docs, drop=0., sgd=None, losses=None):
-        if self._rehearsal_model is None:
-            return
-        scores, bp_scores = self.model.begin_update(docs, drop=drop)
-        target = self._rehearsal_model(docs)
-        gradient = scores - target
-        bp_scores(gradient, sgd=sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += (gradient**2).sum()
-
     def get_loss(self, docs, golds, scores):
-        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
-        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
-        for i, gold in enumerate(golds):
-            for j, label in enumerate(self.labels):
-                if label in gold.cats:
-                    truths[i, j] = gold.cats[label]
-                else:
-                    not_missing[i, j] = 0.
-        truths = self.model.ops.asarray(truths)
-        not_missing = self.model.ops.asarray(not_missing)
-        d_scores = (scores-truths) / scores.shape[0]
-        d_scores *= not_missing
-        mean_square_error = (d_scores**2).sum(axis=1).mean()
-        return float(mean_square_error), d_scores
+        # TODO
+        pass
 
     def add_label(self, label):
-        if label in self.labels:
-            return 0
-        if self.model not in (None, True, False):
-            # This functionality was available previously, but was broken.
-            # The problem is that we resize the last layer, but the last layer
-            # is actually just an ensemble. We're not resizing the child layers
-            # -- a huge problem.
-            raise ValueError(Errors.E116)
-            #smaller = self.model._layers[-1]
-            #larger = Affine(len(self.labels)+1, smaller.nI)
-            #copy_array(larger.W[:smaller.nO], smaller.W)
-            #copy_array(larger.b[:smaller.nO], smaller.b)
-            #self.model._layers[-1] = larger
-        self.labels = tuple(list(self.labels) + [label])
-        return 1
-
-    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
-                       **kwargs):
-        if self.model is True:
-            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
-            self.model = self.Model(len(self.labels), **self.cfg)
-            link_vectors_to_models(self.vocab)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
+        # TODO
+        pass
 
 
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 03261806b..f6296bf89 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -37,16 +37,14 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
     print()
 
-    for alias in [alias1, "rubbish", alias3]:
-        candidates = mykb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias)
+    return mykb
 
 
-def add_el():
+def add_el(kb):
     nlp = spacy.load('en_core_web_sm')
     print("pipes before:", nlp.pipe_names)
 
-    el_pipe = nlp.create_pipe(name='el')
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
 
     print("pipes after:", nlp.pipe_names)
@@ -62,7 +60,12 @@ def add_el():
     for ent in doc.ents:
         print("ent", ent.text, ent.label_, ent.kb_id_)
 
+    print()
+    for alias in ["douglassss", "rubbish", "adam"]:
+        candidates = nlp.linker.kb.get_candidates(alias)
+        print(len(candidates), "candidates for", alias)
+
 
 if __name__ == "__main__":
-    # add_el()
-    create_kb()
+    mykb = create_kb()
+    add_el(mykb)