From 1ee0e78fd7a07637f5ac31154d4c63faeba6f4cd Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 18:55:01 +0100
Subject: [PATCH] select candidate with highest prior probability

---
 examples/pipeline/dummy_entity_linking.py | 69 +++++++++++++++++++++
 spacy/kb.pxd                              | 10 +---
 spacy/kb.pyx                              | 26 +-------
 spacy/pipeline/pipes.pyx                  | 11 +++-
 spacy/sandbox_test_sofie/testing_el.py    | 73 -----------------------
 5 files changed, 81 insertions(+), 108 deletions(-)
 create mode 100644 examples/pipeline/dummy_entity_linking.py
 delete mode 100644 spacy/sandbox_test_sofie/testing_el.py

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
new file mode 100644
index 000000000..c51f321e0
--- /dev/null
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
+Currently still a dummy algorithm: it simply picks the entity with the highest prior probability for a given alias.
+"""
+import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb():
+    kb = KnowledgeBase()
+
+    # adding entities
+    entity_0 = "Q1004791"
+    print("adding entity", entity_0)
+    kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5)
+
+    entity_1 = "Q42"
+    print("adding entity", entity_1)
+    kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5)
+
+    entity_2 = "Q5301561"
+    print("adding entity", entity_2)
+    kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5)
+
+    # adding aliases
+    print()
+    alias_0 = "Douglas"
+    print("adding alias", alias_0, "to all three entities")
+    kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2])
+
+    alias_1 = "Douglas Adams"
+    print("adding alias", alias_1, "to just the one entity")
+    kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9])
+
+    print()
+    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
+
+    return kb
+
+
+def add_el(kb):
+    nlp = spacy.load('en_core_web_sm')
+
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
+
+    for alias in ["Douglas Adams", "Douglas"]:
+        candidates = nlp.linker.kb.get_candidates(alias)
+        print()
+        print(len(candidates), "candidate(s) for", alias, ":")
+        for c in candidates:
+            print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob)
+
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel."
+    doc = nlp(text)
+
+    print()
+    for token in doc:
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
+
+
+if __name__ == "__main__":
+    mykb = create_kb()
+    add_el(mykb)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index cffbcd5d1..490e05036 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -44,15 +44,7 @@ cdef struct _AliasC:
     vector[float] probs
 
 
-# TODO: document
-cdef class Entity:
-
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_id_hash
-    cdef float confidence
-
-
-# TODO: document
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 cdef class Candidate:
 
     cdef readonly KnowledgeBase kb
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index e51cb087d..6d031fb91 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -3,28 +3,6 @@
 from spacy.errors import user_warning
 
 
-cdef class Entity:
-
-    def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
-        self.kb = kb
-        self.entity_id_hash = entity_id_hash
-        self.confidence = confidence
-
-    property kb_id_:
-        """RETURNS (unicode): ID of this entity in the KB"""
-        def __get__(self):
-            return self.kb.strings[self.entity_id_hash]
-
-    property kb_id:
-        """RETURNS (uint64): hash of the entity's KB ID"""
-        def __get__(self):
-            return self.entity_id_hash
-
-    property confidence:
-        def __get__(self):
-            return self.confidence
-
-
 cdef class Candidate:
 
     def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
@@ -103,7 +81,8 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob,
+                          vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -155,6 +134,7 @@ cdef class KnowledgeBase:
 
 
     def get_candidates(self, unicode alias):
+        """TODO: where to put this functionality?"""
         cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 6bb7da1eb..98ca9d76d 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1086,12 +1086,17 @@ class EntityLinker(Pipe):
             yield from docs
 
     def set_annotations(self, docs, scores, tensors=None):
-        # TODO Sofie: actually implement this class instead of dummy implementation
+        """
+        Currently assigns each named entity the KB candidate with the highest prior probability for the entity's text.
+        TODO: actually use the context of the entity, etc.
+        """
         for i, doc in enumerate(docs):
             for ent in doc.ents:
-                if ent.label_ in ["PERSON", "PER"]:
+                candidates = self.kb.get_candidates(ent.text)
+                if candidates:
+                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
                     for token in ent:
-                        token.ent_kb_id_ = "Q42"
+                        token.ent_kb_id_ = best_candidate.entity_id_
 
     def get_loss(self, docs, golds, scores):
         # TODO
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
deleted file mode 100644
index c7b0a3a07..000000000
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding: utf-8
-import spacy
-from spacy.kb import KnowledgeBase
-
-
-def create_kb():
-    mykb = KnowledgeBase()
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    # adding entities
-    entity_0 = "Q0"  # douglas adams
-    print(" adding entity", entity_0)
-    mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
-
-    entity_42 = "Q42"   # douglas adams
-    print(" adding entity", entity_42)
-    mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
-
-    entity_5301561 = "Q5301561"
-    print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    # adding aliases
-    alias1 = "douglassss"
-    print(" adding alias", alias1, "to Q42 and Q5301561")
-    mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
-
-    alias3 = "adam"
-    print(" adding alias", alias3, "to Q42")
-    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    return mykb
-
-
-def add_el(kb):
-    nlp = spacy.load('en_core_web_sm')
-    print("pipes before:", nlp.pipe_names)
-
-    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
-
-    print("pipes after:", nlp.pipe_names)
-    print()
-
-    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
-    doc = nlp(text)
-
-    for token in doc:
-        print("token", token.text, token.ent_type_, token.ent_kb_id_)
-
-    print()
-    for ent in doc.ents:
-        print("ent", ent.text, ent.label_, ent.kb_id_)
-
-    print()
-    for alias in ["douglassss", "rubbish", "adam"]:
-        candidates = nlp.linker.kb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias, ":")
-        for c in candidates:
-            print(" ", c.entity_id_, c.entity_name_, c.alias_)
-
-
-if __name__ == "__main__":
-    mykb = create_kb()
-    add_el(mykb)