From 5ac7edf53c328c90ac4701ef687b0964ea4b756c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 12:38:40 +0100
Subject: [PATCH] adding aliases per entity in the KB

---
 spacy/kb.pxd | 53 +++++++++++++++++++++++++++++++++++++++-------------
 spacy/kb.pyx | 25 ++++++++++++++-----------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 3ba9c8bba..92a0c8b95 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -27,15 +27,25 @@ cdef struct _EntryC:
     float prob
 
 
+# Each alias struct stores a list of entry indices (positions in the _entries vector)
+# with their prior probabilities for this specific mention/alias.
+cdef struct _AliasC:
+
+    # All entry candidates for this alias
+    const vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    const vector[float] probs
+
+
 cdef class KnowledgeBase:
     cdef Pool mem
 
-    # This maps 64bit keys to 64bit values. Here the key would be a hash of
-    # a unique string name for the entity, and the value would be the position
-    # of the _EntryC struct in our vector.
+    # This maps 64bit keys (hash of unique entity string)
+    # to 64bit values (position of the _EntryC struct in the _entries vector).
     # The PreshMap is pretty space efficient, as it uses open addressing. So
     # the only overhead is the vacancy rate, which is approximately 30%.
-    cdef PreshMap _index
+    cdef PreshMap _entry_index
 
     # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
     # over allocation.
@@ -43,6 +53,16 @@ cdef class KnowledgeBase:
     # Storing 1m entries would take 41.6mb under this scheme.
     cdef vector[_EntryC] _entries
 
+    # This maps 64bit keys (hash of unique alias string)
+    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
+    cdef PreshMap _alias_index
+
+    # This stores one _AliasC struct per alias, at the position recorded in
+    # _alias_index. Each struct holds the candidate entry indices for that
+    # alias together with their prior probabilities P(entity | alias), kept
+    # as two vectors (entry_indices and probs).
+    cdef vector[_AliasC] _aliases_table
+
     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
     # and possibly usage.
@@ -61,23 +81,30 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    # This should map mention hashes to (entry_id, prob) tuples. The probability
-    # should be P(entity | mention), which is pretty important to know.
-    # We can pack both pieces of information into a 64-bit value, to keep things
-    # efficient.
-    cdef object _aliases_table
 
-    cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t index = self._entries.size()
+        cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[key] = index
-        return index
\ No newline at end of file
+        self._entry_index[entity_key] = entity_index
+        return entity_index
+
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+        """Connect a mention to a list of potential entities with their prior probabilities and return the alias index."""
+        cdef int64_t alias_index = self._aliases_table.size()
+
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=entry_indices,
+                probs=probs
+            ))
+        self._alias_index[alias_key] = alias_index
+        return alias_index
\ No newline at end of file
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 46acc2967..0f6a7aecc 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,16 +5,16 @@ cdef class KnowledgeBase:
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
+    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
         # TODO: more friendly check for non-unique name
-        if name in self:
+        if entity_id in self:
             return
 
-        cdef hash_t name_hash = hash_string(name)
+        cdef hash_t id_hash = hash_string(entity_id)
         cdef int32_t dummy_value = 342
-        self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
+        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
-        #  self._features_table.get(features))
+        # self._features_table.get(features))
 
     def add_alias(self, alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
@@ -22,10 +22,13 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash = 0
         cdef int64_t entity_index = 0
 
-        # TODO: check len(entities) == len(probabilities)
-        for entity, prob in zip(entities, probabilities):
-            entity_hash = hash_string(entity)
-            entity_index = self._index[entity_hash]
-            # TODO: check that entity is already in this KB (entity_index is OK)
-            self._aliases_table.add(alias_hash, entity_index, prob)
+        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
+
+        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
+
+        # TODO: check that alias hadn't been defined before
+        # TODO: check that entity is already in this KB (entity_index is OK)
+        # TODO: check sum(probabilities) <= 1
+        # TODO: check len(entities) == len(probabilities)
+