From 6e3223f23494a8c3361290a748de39f5768438d4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 24 Apr 2019 11:26:38 +0200
Subject: [PATCH] bulk loading in proper order of entity indices

---
 examples/pipeline/wikidata_entity_linking.py | 13 ++--
 spacy/kb.pxd                                 | 57 +++++------------
 spacy/kb.pyx                                 | 65 +++++++++++++-------
 spacy/structs.pxd                            | 37 +++++++++++
 4 files changed, 100 insertions(+), 72 deletions(-)

diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index db8d4577c..674c6166c 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -424,9 +424,8 @@ if __name__ == "__main__":
     # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
 
     # STEP 3 : write KB to file
-    # TODO
-    nlp = spacy.load('en_core_web_sm')
-    kb1 = KnowledgeBase(vocab=nlp.vocab)
+    nlp1 = spacy.load('en_core_web_sm')
+    kb1 = KnowledgeBase(vocab=nlp1.vocab)
 
     kb1.add_entity(entity="Q53", prob=0.33)
     kb1.add_entity(entity="Q17", prob=0.1)
@@ -437,11 +436,11 @@ if __name__ == "__main__":
     kb1.dump(KB_FILE)
 
     # STEP 4 : read KB back in from file
-    # TODO
-    kb2 = KnowledgeBase(vocab=nlp.vocab)
-    kb2.load(KB_FILE)
-    print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases())
+    nlp3 = spacy.load('en_core_web_sm')
+    kb3 = KnowledgeBase(vocab=nlp3.vocab)
+    kb3.load_bulk(7, KB_FILE)
+    print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())
 
     # STEP 5 : actually use the EL functionality
     # add_el(my_kb, nlp)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index c655c6bff..817b7ff25 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -1,48 +1,17 @@
 """Knowledge-base for entity or concept linking."""
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
+
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport FILE
 
 from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
-from libc.stdio cimport FILE
-
-
-# Internal struct, for storage and disambiguation. This isn't what we return
-# to the user as the answer to "here's your entity". It's the minimum number
-# of bits we need to keep track of the answers.
-cdef struct _EntryC:
-
-    # The hash of this entry's unique ID/name in the kB
-    hash_t entity_hash
-
-    # Allows retrieval of one or more vectors.
-    # Each element of vector_rows should be an index into a vectors table.
-    # Every entry should have the same number of vectors, so we can avoid storing
-    # the number of vectors in each knowledge-base struct
-    int32_t* vector_rows
-
-    # Allows retrieval of a struct of non-vector features. We could make this a
-    # pointer, but we have 32 bits left over in the struct after prob, so we'd
-    # like this to only be 32 bits. We can also set this to -1, for the common
-    # case where there are no features.
-    int32_t feats_row
-
-    # log probability of entity, based on corpus frequency
-    float prob
-
-
-# Each alias struct stores a list of Entry pointers with their prior probabilities
-# for this specific mention/alias.
-cdef struct _AliasC:
-
-    # All entry candidates for this alias
-    vector[int64_t] entry_indices
-
-    # Prior probability P(entity|alias) - should sum up to (at most) 1.
-    vector[float] probs
+from .structs cimport EntryC, AliasC
+
+ctypedef vector[EntryC] entry_vec
+ctypedef vector[AliasC] alias_vec
 
 
 # Object used by the Entity Linker that summarizes one entity-alias candidate combination.
@@ -68,7 +37,7 @@ cdef class KnowledgeBase:
     # over allocation.
     # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
     # Storing 1m entries would take 41.6mb under this scheme.
-    cdef vector[_EntryC] _entries
+    cdef entry_vec _entries
 
     # This maps 64bit keys (hash of unique alias string)
     # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
@@ -78,7 +47,7 @@ cdef class KnowledgeBase:
     # should be P(entity | mention), which is pretty important to know.
     # We can pack both pieces of information into a 64-bit value, to keep things
     # efficient.
-    cdef vector[_AliasC] _aliases_table
+    cdef alias_vec _aliases_table
 
     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
@@ -98,6 +67,7 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table
 
+
    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
                                     int32_t* vector_rows, int feats_row) nogil:
        """Add an entry to the vector of entries.
@@ -107,7 +77,7 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._entries.size()
 
         # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
-        cdef _EntryC entry
+        cdef EntryC entry
         entry.entity_hash = entity_hash
         entry.vector_rows = vector_rows
         entry.feats_row = feats_row
@@ -124,7 +94,7 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._aliases_table.size()
 
         # Avoid struct initializer to enable nogil
-        cdef _AliasC alias
+        cdef AliasC alias
         alias.entry_indices = entry_indices
         alias.probs = probs
@@ -140,7 +110,7 @@ cdef class KnowledgeBase:
         cdef int32_t dummy_value = 0
 
         # Avoid struct initializer to enable nogil
-        cdef _EntryC entry
+        cdef EntryC entry
         entry.entity_hash = dummy_hash
         entry.vector_rows = &dummy_value
         entry.feats_row = dummy_value
@@ -152,20 +122,21 @@ cdef class KnowledgeBase:
         cdef vector[float] dummy_probs
         dummy_probs.push_back(0)
 
-        cdef _AliasC alias
+        cdef AliasC alias
         alias.entry_indices = dummy_entry_indices
         alias.probs = dummy_probs
 
         self._entries.push_back(entry)
         self._aliases_table.push_back(alias)
 
+    cpdef load_bulk(self, int nr_entities, loc)
+
 
 cdef class Writer:
     cdef FILE* _fp
 
     cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
 
-
 cdef class Reader:
     cdef FILE* _fp
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 4ec910b03..c967654d3 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -7,6 +7,9 @@ from cpython.exc cimport PyErr_CheckSignals
 from spacy import util
 from spacy.errors import Errors, Warnings, user_warning
 
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
 from cpython.mem cimport PyMem_Malloc
 from cpython.exc cimport PyErr_SetFromErrno
@@ -17,6 +20,8 @@ from libc.stdlib cimport qsort
 from .typedefs cimport hash_t
 from os import path
 
+from libcpp.vector cimport vector
+
 
 cdef class Candidate:
@@ -53,7 +58,6 @@ cdef class Candidate:
 
 
 cdef class KnowledgeBase:
-
     def __init__(self, Vocab vocab):
         self.vocab = vocab
         self.mem = Pool()
@@ -67,13 +71,13 @@ cdef class KnowledgeBase:
         return self.get_size_entities()
 
     def get_size_entities(self):
-        return self._entries.size() - 1  # not counting dummy element on index 0
+        return len(self._entry_index)
 
     def get_entity_strings(self):
         return [self.vocab.strings[x] for x in self._entry_index][1:]  # removing the dummy element on index 0
 
     def get_size_aliases(self):
-        return self._aliases_table.size() - 1  # not counting dummy element on index
+        return len(self._alias_index)
 
     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index][1:]  # removing the dummy element on index 0
@@ -159,33 +163,44 @@ cdef class KnowledgeBase:
 
     def dump(self, loc):
         cdef Writer writer = Writer(loc)
-        for key, entry_index in self._entry_index.items():
+        # dumping the entry records in the order in which they are in the _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        i = 1
+        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
             entry = self._entries[entry_index]
             print("dumping")
             print("index", entry_index)
             print("hash", entry.entity_hash)
+            assert entry.entity_hash == entry_hash
+            assert entry_index == i
             print("prob", entry.prob)
             print("")
             writer.write(entry_index, entry.entity_hash, entry.prob)
+            i = i+1
         writer.close()
 
-    def load(self, loc):
+    cpdef load_bulk(self, int nr_entities, loc):
+        # TODO: nr_entities from header in file (Reader constructor)
         cdef int64_t entry_id
         cdef hash_t entity_hash
         cdef float prob
-        cdef _EntryC entry
+        cdef EntryC entry
         cdef int32_t dummy_value = 342
 
         cdef Reader reader = Reader(loc)
-        result = reader.read(self.mem, &entry_id, &entity_hash, &prob)  # -1: error, 0: eof after this one
-        while result:
-            print("loading")
-            print("entryID", entry_id)
-            print("hash", entity_hash)
-            print("prob", prob)
-            print("result:", result)
-            print("")
+        to_read = self.get_size_entities()
+
+        self._entry_index = PreshMap(nr_entities+1)
+        self._entries = entry_vec(nr_entities+1)
+
+        # we assume the data was written in sequence
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # TODO: should we initialize the dummy objects ?
+        cdef int i = 1
+        while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities:
+            assert i == entry_id
 
             entry.entity_hash = entity_hash
             entry.prob = prob
@@ -193,9 +208,18 @@ cdef class KnowledgeBase:
             entry.vector_rows = &dummy_value
             entry.feats_row = dummy_value
 
-            # TODO: use set instead of push_back to ensure the index remains the same?
-            self._entries.push_back(entry)
-            result = reader.read(self.mem, &entry_id, &entity_hash, &prob)
+            print("bulk loading")
+            print("i", i)
+            print("entryID", entry_id)
+            print("hash", entry.entity_hash)
+            print("prob", entry.prob)
+            print("")
+
+            self._entries[i] = entry
+            self._entry_index[entity_hash] = i
+
+            i += 1
+
 
 cdef class Writer:
     def __init__(self, object loc):
@@ -236,11 +260,6 @@ cdef class Reader:
         fclose(self._fp)
 
     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
-        """
-        Return values:
-        -1: error during current read (EOF during call)
-        0: means we read the last line succesfully (EOF after call)
-        1: we can continue reading this file """
         status = fread(entry_id, sizeof(int64_t), 1, self._fp)
         if status < 1:
             if feof(self._fp):
@@ -263,3 +282,5 @@ cdef class Reader:
             return 0
         else:
             return 1
+
+
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 154202c0d..69a1f4961 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
 from .typedefs cimport flags_t, attr_t, hash_t
 from .parts_of_speech cimport univ_pos_t
 
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t, int64_t
+
+
 cdef struct LexemeC:
     flags_t flags
@@ -72,3 +76,36 @@ cdef struct TokenC:
     attr_t ent_type  # TODO: Is there a better way to do this? Multiple sources of truth..
     attr_t ent_kb_id
     hash_t ent_id
+
+
+# Internal struct, for storage and disambiguation of entities.
+cdef struct EntryC:
+
+    # The hash of this entry's unique ID/name in the kB
+    hash_t entity_hash
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid storing
+    # the number of vectors in each knowledge-base struct
+    int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+
+    # log probability of entity, based on corpus frequency
+    float prob
+
+
+# Each alias struct stores a list of Entry pointers with their prior probabilities
+# for this specific mention/alias.
+cdef struct AliasC:
+
+    # All entry candidates for this alias
+    vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    vector[float] probs
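
Note on the serialization format this patch exercises: Writer.write() and Reader.read() move fixed-width binary triples of entry index (int64_t), entity hash (hash_t, an unsigned 64-bit integer) and prior probability (32-bit float), dumped in ascending index order with the dummy entry at index 0 skipped. That ordering is what lets load_bulk() rebuild _entries and _entry_index in a single sequential pass. The pure-Python sketch below mirrors that record layout and loading loop; the dump_entries/load_bulk_records helpers, the file name and the "=qQf" struct format are illustrative assumptions for this note, not part of the spaCy API.

import struct

# One record per entity, mirroring Writer.write()/Reader.read() in kb.pyx:
# int64 entry index, uint64 entity hash, 32-bit float prior probability.
# "=" packs without padding; the real code uses native fwrite/fread.
RECORD = struct.Struct("=qQf")


def dump_entries(loc, entries):
    # entries[0] is the dummy placeholder and is not written out,
    # matching KnowledgeBase.dump() which starts at index 1.
    with open(loc, "wb") as f:
        for i, (entity_hash, prob) in enumerate(entries):
            if i == 0:
                continue
            f.write(RECORD.pack(i, entity_hash, prob))


def load_bulk_records(loc, nr_entities):
    # Pre-size the tables, as load_bulk() does with entry_vec/PreshMap,
    # and rebuild the hash -> index mapping while reading in sequence.
    entries = [(0, 0.0)] * (nr_entities + 1)
    entry_index = {}
    with open(loc, "rb") as f:
        for i in range(1, nr_entities + 1):
            buf = f.read(RECORD.size)
            if len(buf) < RECORD.size:
                break  # end of file
            entry_id, entity_hash, prob = RECORD.unpack(buf)
            assert entry_id == i  # records were dumped in index order
            entries[i] = (entity_hash, prob)
            entry_index[entity_hash] = i
    return entries, entry_index


if __name__ == "__main__":
    # Hypothetical hashes standing in for vocab string hashes of "Q53"/"Q17";
    # probs chosen to be exactly representable as 32-bit floats.
    original = [(0, 0.0), (11111111, 0.25), (22222222, 0.5)]
    dump_entries("kb_dump.bin", original)
    entries, entry_index = load_bulk_records("kb_dump.bin", nr_entities=2)
    assert entries == original
    assert entry_index == {11111111: 1, 22222222: 2}

Writing the index into each record is redundant with sequential order, but it gives the loader the cheap assert-on-read consistency check seen in load_bulk() above.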