From 6e3223f23494a8c3361290a748de39f5768438d4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 24 Apr 2019 11:26:38 +0200
Subject: [PATCH] bulk loading in proper order of entity indices

---
 examples/pipeline/wikidata_entity_linking.py | 13 ++--
 spacy/kb.pxd                                 | 57 +++++------------
 spacy/kb.pyx                                 | 65 +++++++++++++-------
 spacy/structs.pxd                            | 37 +++++++++++
 4 files changed, 100 insertions(+), 72 deletions(-)

diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index db8d4577c..674c6166c 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -424,9 +424,8 @@ if __name__ == "__main__":
     # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
 
     # STEP 3 : write KB to file
-    # TODO
-    nlp = spacy.load('en_core_web_sm')
-    kb1 = KnowledgeBase(vocab=nlp.vocab)
+    nlp1 = spacy.load('en_core_web_sm')
+    kb1 = KnowledgeBase(vocab=nlp1.vocab)
 
     kb1.add_entity(entity="Q53", prob=0.33)
     kb1.add_entity(entity="Q17", prob=0.1)
@@ -437,11 +436,11 @@ if __name__ == "__main__":
     kb1.dump(KB_FILE)
 
     # STEP 4 : read KB back in from file
-    # TODO
-    kb2 = KnowledgeBase(vocab=nlp.vocab)
-    kb2.load(KB_FILE)
-    print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases())
+    nlp3 = spacy.load('en_core_web_sm')
+    kb3 = KnowledgeBase(vocab=nlp3.vocab)
+    kb3.load_bulk(7, KB_FILE)
+    print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())
 
     # STEP 5 : actually use the EL functionality
     # add_el(my_kb, nlp)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index c655c6bff..817b7ff25 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -1,48 +1,17 @@
 """Knowledge-base for entity or concept linking."""
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
+
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport FILE
 
 from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
-from libc.stdio cimport FILE
-
-
-# Internal struct, for storage and disambiguation. This isn't what we return
-# to the user as the answer to "here's your entity". It's the minimum number
-# of bits we need to keep track of the answers.
-cdef struct _EntryC:
-
-    # The hash of this entry's unique ID/name in the kB
-    hash_t entity_hash
-
-    # Allows retrieval of one or more vectors.
-    # Each element of vector_rows should be an index into a vectors table.
-    # Every entry should have the same number of vectors, so we can avoid storing
-    # the number of vectors in each knowledge-base struct
-    int32_t* vector_rows
-
-    # Allows retrieval of a struct of non-vector features. We could make this a
-    # pointer, but we have 32 bits left over in the struct after prob, so we'd
-    # like this to only be 32 bits. We can also set this to -1, for the common
-    # case where there are no features.
-    int32_t feats_row
-
-    # log probability of entity, based on corpus frequency
-    float prob
-
-
-# Each alias struct stores a list of Entry pointers with their prior probabilities
-# for this specific mention/alias.
-cdef struct _AliasC:
-
-    # All entry candidates for this alias
-    vector[int64_t] entry_indices
-
-    # Prior probability P(entity|alias) - should sum up to (at most) 1.
-    vector[float] probs
+from .structs cimport EntryC, AliasC
+
+ctypedef vector[EntryC] entry_vec
+ctypedef vector[AliasC] alias_vec
 
 
 # Object used by the Entity Linker that summarizes one entity-alias candidate combination.
@@ -68,7 +37,7 @@ cdef class KnowledgeBase:
     # over allocation.
     # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
     # Storing 1m entries would take 41.6mb under this scheme.
-    cdef vector[_EntryC] _entries
+    cdef entry_vec _entries
 
     # This maps 64bit keys (hash of unique alias string)
     # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
@@ -78,7 +47,7 @@ cdef class KnowledgeBase:
     # should be P(entity | mention), which is pretty important to know.
     # We can pack both pieces of information into a 64-bit value, to keep things
     # efficient.
-    cdef vector[_AliasC] _aliases_table
+    cdef alias_vec _aliases_table
 
     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
@@ -98,6 +67,7 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table
 
+
    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
                                     int32_t* vector_rows, int feats_row) nogil:
        """Add an entry to the vector of entries.
@@ -107,7 +77,7 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._entries.size()
 
         # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
-        cdef _EntryC entry
+        cdef EntryC entry
         entry.entity_hash = entity_hash
         entry.vector_rows = vector_rows
         entry.feats_row = feats_row
@@ -124,7 +94,7 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._aliases_table.size()
 
         # Avoid struct initializer to enable nogil
-        cdef _AliasC alias
+        cdef AliasC alias
         alias.entry_indices = entry_indices
         alias.probs = probs
@@ -140,7 +110,7 @@ cdef class KnowledgeBase:
         cdef int32_t dummy_value = 0
 
         # Avoid struct initializer to enable nogil
-        cdef _EntryC entry
+        cdef EntryC entry
         entry.entity_hash = dummy_hash
         entry.vector_rows = &dummy_value
         entry.feats_row = dummy_value
@@ -152,20 +122,21 @@ cdef class KnowledgeBase:
         cdef vector[float] dummy_probs
         dummy_probs.push_back(0)
 
-        cdef _AliasC alias
+        cdef AliasC alias
         alias.entry_indices = dummy_entry_indices
         alias.probs = dummy_probs
 
         self._entries.push_back(entry)
         self._aliases_table.push_back(alias)
 
+    cpdef load_bulk(self, int nr_entities, loc)
+
 
 cdef class Writer:
     cdef FILE* _fp
 
     cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
 
-
 cdef class Reader:
     cdef FILE* _fp
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 4ec910b03..c967654d3 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -7,6 +7,9 @@ from cpython.exc cimport PyErr_CheckSignals
 from spacy import util
 from spacy.errors import Errors, Warnings, user_warning
 
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
 from cpython.mem cimport PyMem_Malloc
 from cpython.exc cimport PyErr_SetFromErrno
@@ -17,6 +20,8 @@ from libc.stdlib cimport qsort
 from .typedefs cimport hash_t
 from os import path
 
+from libcpp.vector cimport vector
+
 
 cdef class Candidate:
@@ -53,7 +58,6 @@ cdef class Candidate:
 
 
 cdef class KnowledgeBase:
-
     def __init__(self, Vocab vocab):
         self.vocab = vocab
         self.mem = Pool()
@@ -67,13 +71,13 @@ cdef class KnowledgeBase:
         return self.get_size_entities()
 
     def get_size_entities(self):
-        return self._entries.size() - 1  # not counting dummy element on index 0
+        return len(self._entry_index)
 
     def get_entity_strings(self):
         return [self.vocab.strings[x] for x in self._entry_index][1:]  # removing the dummy element on index 0
 
     def get_size_aliases(self):
-        return self._aliases_table.size() - 1  # not counting dummy element on index
+        return len(self._alias_index)
 
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index][1:]  # removing the dummy element on index 0
@@ -159,33 +163,44 @@ cdef class KnowledgeBase:
 
     def dump(self, loc):
         cdef Writer writer = Writer(loc)
-        for key, entry_index in self._entry_index.items():
+        # dumping the entry records in the order in which they are in the _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        i = 1
+        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
             entry = self._entries[entry_index]
             print("dumping")
             print("index", entry_index)
             print("hash", entry.entity_hash)
+            assert entry.entity_hash == entry_hash
+            assert entry_index == i
             print("prob", entry.prob)
             print("")
             writer.write(entry_index, entry.entity_hash, entry.prob)
+            i = i+1
         writer.close()
 
-    def load(self, loc):
+    cpdef load_bulk(self, int nr_entities, loc):
+        # TODO: nr_entities from header in file (Reader constructor)
         cdef int64_t entry_id
         cdef hash_t entity_hash
         cdef float prob
-        cdef _EntryC entry
+        cdef EntryC entry
         cdef int32_t dummy_value = 342
 
         cdef Reader reader = Reader(loc)
-        result = reader.read(self.mem, &entry_id, &entity_hash, &prob)  # -1: error, 0: eof after this one
-        while result:
-            print("loading")
-            print("entryID", entry_id)
-            print("hash", entity_hash)
-            print("prob", prob)
-            print("result:", result)
-            print("")
+        to_read = self.get_size_entities()
+
+        self._entry_index = PreshMap(nr_entities+1)
+        self._entries = entry_vec(nr_entities+1)
+
+        # we assume the data was written in sequence
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # TODO: should we initialize the dummy objects ?
+        cdef int i = 1
+        while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities:
+            assert i == entry_id
 
             entry.entity_hash = entity_hash
             entry.prob = prob
@@ -193,9 +208,18 @@ cdef class KnowledgeBase:
             entry.vector_rows = &dummy_value
             entry.feats_row = dummy_value
 
-            # TODO: use set instead of push_back to ensure the index remains the same?
-            self._entries.push_back(entry)
-            result = reader.read(self.mem, &entry_id, &entity_hash, &prob)
+            print("bulk loading")
+            print("i", i)
+            print("entryID", entry_id)
+            print("hash", entry.entity_hash)
+            print("prob", entry.prob)
+            print("")
+
+            self._entries[i] = entry
+            self._entry_index[entity_hash] = i
+
+            i += 1
+
 
 cdef class Writer:
     def __init__(self, object loc):
@@ -236,11 +260,6 @@ cdef class Reader:
         fclose(self._fp)
 
     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
-        """
-        Return values:
-        -1: error during current read (EOF during call)
-        0: means we read the last line succesfully (EOF after call)
-        1: we can continue reading this file """
         status = fread(entry_id, sizeof(int64_t), 1, self._fp)
         if status < 1:
             if feof(self._fp):
@@ -263,3 +282,5 @@ cdef class Reader:
             return 0
         else:
             return 1
+
+
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 154202c0d..69a1f4961 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
 from .typedefs cimport flags_t, attr_t, hash_t
 from .parts_of_speech cimport univ_pos_t
 
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t, int64_t
+
+
 cdef struct LexemeC:
     flags_t flags
@@ -72,3 +76,36 @@ cdef struct TokenC:
     attr_t ent_type  # TODO: Is there a better way to do this? Multiple sources of truth..
     attr_t ent_kb_id
     hash_t ent_id
+
+
+# Internal struct, for storage and disambiguation of entities.
+cdef struct EntryC:
+
+    # The hash of this entry's unique ID/name in the kB
+    hash_t entity_hash
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid storing
+    # the number of vectors in each knowledge-base struct
+    int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+
+    # log probability of entity, based on corpus frequency
+    float prob
+
+
+# Each alias struct stores a list of Entry pointers with their prior probabilities
+# for this specific mention/alias.
+cdef struct AliasC:
+
+    # All entry candidates for this alias
+    vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    vector[float] probs
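
Note on the serialization format this patch exercises: Writer.write() and Reader.read() move fixed-width binary triples of entry index (int64_t), entity hash (hash_t, an unsigned 64-bit integer) and prior probability (32-bit float), dumped in ascending index order with the dummy entry at index 0 skipped. That ordering is what lets load_bulk() rebuild _entries and _entry_index in a single sequential pass. The pure-Python sketch below mirrors that record layout and loading loop; the dump_entries/load_bulk_records helpers, the file name and the "=qQf" struct format are illustrative assumptions for this note, not part of the spaCy API.

import struct

# One record per entity, mirroring Writer.write()/Reader.read() in kb.pyx:
# int64 entry index, uint64 entity hash, 32-bit float prior probability.
# "=" packs without padding; the real code uses native fwrite/fread.
RECORD = struct.Struct("=qQf")


def dump_entries(loc, entries):
    # entries[0] is the dummy placeholder and is not written out,
    # matching KnowledgeBase.dump() which starts at index 1.
    with open(loc, "wb") as f:
        for i, (entity_hash, prob) in enumerate(entries):
            if i == 0:
                continue
            f.write(RECORD.pack(i, entity_hash, prob))


def load_bulk_records(loc, nr_entities):
    # Pre-size the tables, as load_bulk() does with entry_vec/PreshMap,
    # and rebuild the hash -> index mapping while reading in sequence.
    entries = [(0, 0.0)] * (nr_entities + 1)
    entry_index = {}
    with open(loc, "rb") as f:
        for i in range(1, nr_entities + 1):
            buf = f.read(RECORD.size)
            if len(buf) < RECORD.size:
                break  # end of file
            entry_id, entity_hash, prob = RECORD.unpack(buf)
            assert entry_id == i  # records were dumped in index order
            entries[i] = (entity_hash, prob)
            entry_index[entity_hash] = i
    return entries, entry_index


if __name__ == "__main__":
    # Hypothetical hashes standing in for vocab string hashes of "Q53"/"Q17";
    # probs chosen to be exactly representable as 32-bit floats.
    original = [(0, 0.0), (11111111, 0.25), (22222222, 0.5)]
    dump_entries("kb_dump.bin", original)
    entries, entry_index = load_bulk_records("kb_dump.bin", nr_entities=2)
    assert entries == original
    assert entry_index == {11111111: 1, 22222222: 2}

Writing the index into each record is redundant with sequential order, but it gives the loader the cheap assert-on-read consistency check seen in load_bulk() above.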