bulk loading in proper order of entity indices

This commit is contained in:
svlandeg 2019-04-24 11:26:38 +02:00
parent 694fea597a
commit 6e3223f234
4 changed files with 100 additions and 72 deletions

View File

@ -424,9 +424,8 @@ if __name__ == "__main__":
# my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
# STEP 3 : write KB to file # STEP 3 : write KB to file
# TODO nlp1 = spacy.load('en_core_web_sm')
nlp = spacy.load('en_core_web_sm') kb1 = KnowledgeBase(vocab=nlp1.vocab)
kb1 = KnowledgeBase(vocab=nlp.vocab)
kb1.add_entity(entity="Q53", prob=0.33) kb1.add_entity(entity="Q53", prob=0.33)
kb1.add_entity(entity="Q17", prob=0.1) kb1.add_entity(entity="Q17", prob=0.1)
@ -437,11 +436,11 @@ if __name__ == "__main__":
kb1.dump(KB_FILE) kb1.dump(KB_FILE)
# STEP 4 : read KB back in from file # STEP 4 : read KB back in from file
# TODO
kb2 = KnowledgeBase(vocab=nlp.vocab) nlp3 = spacy.load('en_core_web_sm')
kb2.load(KB_FILE) kb3 = KnowledgeBase(vocab=nlp3.vocab)
print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) kb3.load_bulk(7, KB_FILE)
print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())
# STEP 5 : actually use the EL functionality # STEP 5 : actually use the EL functionality
# add_el(my_kb, nlp) # add_el(my_kb, nlp)

View File

@ -1,48 +1,17 @@
"""Knowledge-base for entity or concept linking.""" """Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from spacy.vocab cimport Vocab from spacy.vocab cimport Vocab
from .typedefs cimport hash_t from .typedefs cimport hash_t
from libc.stdio cimport FILE from .structs cimport EntryC, AliasC
ctypedef vector[EntryC] entry_vec
ctypedef vector[AliasC] alias_vec
# Internal struct, for storage and disambiguation. This isn't what we return
# to the user as the answer to "here's your entity". It's the minimum number
# of bits we need to keep track of the answers.
# One _EntryC holds, for a single entity: its hash, a pointer to its vector
# rows, a feature-row index and a probability.
cdef struct _EntryC:
# The hash of this entry's unique ID/name in the KB
hash_t entity_hash
# Allows retrieval of one or more vectors.
# Each element of vector_rows should be an index into a vectors table.
# Every entry should have the same number of vectors, so we can avoid storing
# the number of vectors in each knowledge-base struct
int32_t* vector_rows
# Allows retrieval of a struct of non-vector features. We could make this a
# pointer, but we have 32 bits left over in the struct after prob, so we'd
# like this to only be 32 bits. We can also set this to -1, for the common
# case where there are no features.
int32_t feats_row
# Documented as the log probability of the entity, based on corpus frequency.
# NOTE(review): callers have been seen passing raw values such as prob=0.33 to
# add_entity - confirm whether a log value is really what gets stored here.
float prob
# Each alias struct stores the indices of its candidate entries together with
# their prior probabilities for this specific mention/alias.
# (Indices, not pointers: the field below is a vector of int64_t.)
cdef struct _AliasC:
# All entry candidates for this alias, stored as integer indices
# (presumably positions in the knowledge base's entries vector - TODO confirm).
vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1.
# Presumably parallel to entry_indices (one value per candidate) - TODO confirm.
vector[float] probs
# Object used by the Entity Linker that summarizes one entity-alias candidate combination. # Object used by the Entity Linker that summarizes one entity-alias candidate combination.
@ -68,7 +37,7 @@ cdef class KnowledgeBase:
# over allocation. # over allocation.
# In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries. # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
# Storing 1m entries would take 41.6mb under this scheme. # Storing 1m entries would take 41.6mb under this scheme.
cdef vector[_EntryC] _entries cdef entry_vec _entries
# This maps 64bit keys (hash of unique alias string) # This maps 64bit keys (hash of unique alias string)
# to 64bit values (position of the _AliasC struct in the _aliases_table vector). # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
@ -78,7 +47,7 @@ cdef class KnowledgeBase:
# should be P(entity | mention), which is pretty important to know. # should be P(entity | mention), which is pretty important to know.
# We can pack both pieces of information into a 64-bit value, to keep things # We can pack both pieces of information into a 64-bit value, to keep things
# efficient. # efficient.
cdef vector[_AliasC] _aliases_table cdef alias_vec _aliases_table
# This is the part which might take more space: storing various # This is the part which might take more space: storing various
# categorical features for the entries, and storing vectors for disambiguation # categorical features for the entries, and storing vectors for disambiguation
@ -98,6 +67,7 @@ cdef class KnowledgeBase:
# optional data, we can let users configure a DB as the backend for this. # optional data, we can let users configure a DB as the backend for this.
cdef object _features_table cdef object _features_table
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
int32_t* vector_rows, int feats_row) nogil: int32_t* vector_rows, int feats_row) nogil:
"""Add an entry to the vector of entries. """Add an entry to the vector of entries.
@ -107,7 +77,7 @@ cdef class KnowledgeBase:
cdef int64_t new_index = self._entries.size() cdef int64_t new_index = self._entries.size()
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
cdef _EntryC entry cdef EntryC entry
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.vector_rows = vector_rows entry.vector_rows = vector_rows
entry.feats_row = feats_row entry.feats_row = feats_row
@ -124,7 +94,7 @@ cdef class KnowledgeBase:
cdef int64_t new_index = self._aliases_table.size() cdef int64_t new_index = self._aliases_table.size()
# Avoid struct initializer to enable nogil # Avoid struct initializer to enable nogil
cdef _AliasC alias cdef AliasC alias
alias.entry_indices = entry_indices alias.entry_indices = entry_indices
alias.probs = probs alias.probs = probs
@ -140,7 +110,7 @@ cdef class KnowledgeBase:
cdef int32_t dummy_value = 0 cdef int32_t dummy_value = 0
# Avoid struct initializer to enable nogil # Avoid struct initializer to enable nogil
cdef _EntryC entry cdef EntryC entry
entry.entity_hash = dummy_hash entry.entity_hash = dummy_hash
entry.vector_rows = &dummy_value entry.vector_rows = &dummy_value
entry.feats_row = dummy_value entry.feats_row = dummy_value
@ -152,20 +122,21 @@ cdef class KnowledgeBase:
cdef vector[float] dummy_probs cdef vector[float] dummy_probs
dummy_probs.push_back(0) dummy_probs.push_back(0)
cdef _AliasC alias cdef AliasC alias
alias.entry_indices = dummy_entry_indices alias.entry_indices = dummy_entry_indices
alias.probs = dummy_probs alias.probs = dummy_probs
self._entries.push_back(entry) self._entries.push_back(entry)
self._aliases_table.push_back(alias) self._aliases_table.push_back(alias)
cpdef load_bulk(self, int nr_entities, loc)
cdef class Writer: cdef class Writer:
cdef FILE* _fp cdef FILE* _fp
cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
cdef class Reader: cdef class Reader:
cdef FILE* _fp cdef FILE* _fp

View File

@ -7,6 +7,9 @@ from cpython.exc cimport PyErr_CheckSignals
from spacy import util from spacy import util
from spacy.errors import Errors, Warnings, user_warning from spacy.errors import Errors, Warnings, user_warning
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.mem cimport PyMem_Malloc from cpython.mem cimport PyMem_Malloc
from cpython.exc cimport PyErr_SetFromErrno from cpython.exc cimport PyErr_SetFromErrno
@ -17,6 +20,8 @@ from libc.stdlib cimport qsort
from .typedefs cimport hash_t from .typedefs cimport hash_t
from os import path from os import path
from libcpp.vector cimport vector
cdef class Candidate: cdef class Candidate:
@ -53,7 +58,6 @@ cdef class Candidate:
cdef class KnowledgeBase: cdef class KnowledgeBase:
def __init__(self, Vocab vocab): def __init__(self, Vocab vocab):
self.vocab = vocab self.vocab = vocab
self.mem = Pool() self.mem = Pool()
@ -67,13 +71,13 @@ cdef class KnowledgeBase:
return self.get_size_entities() return self.get_size_entities()
def get_size_entities(self): def get_size_entities(self):
return self._entries.size() - 1 # not counting dummy element on index 0 return len(self._entry_index)
def get_entity_strings(self): def get_entity_strings(self):
return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0
def get_size_aliases(self): def get_size_aliases(self):
return self._aliases_table.size() - 1 # not counting dummy element on index return len(self._alias_index)
def get_alias_strings(self): def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0
@ -159,33 +163,44 @@ cdef class KnowledgeBase:
def dump(self, loc): def dump(self, loc):
cdef Writer writer = Writer(loc) cdef Writer writer = Writer(loc)
for key, entry_index in self._entry_index.items(): # dumping the entry records in the order in which they are in the _entries vector.
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
entry = self._entries[entry_index] entry = self._entries[entry_index]
print("dumping") print("dumping")
print("index", entry_index) print("index", entry_index)
print("hash", entry.entity_hash) print("hash", entry.entity_hash)
assert entry.entity_hash == entry_hash
assert entry_index == i
print("prob", entry.prob) print("prob", entry.prob)
print("") print("")
writer.write(entry_index, entry.entity_hash, entry.prob) writer.write(entry_index, entry.entity_hash, entry.prob)
i = i+1
writer.close() writer.close()
def load(self, loc): cpdef load_bulk(self, int nr_entities, loc):
# TODO: nr_entities from header in file (Reader constructor)
cdef int64_t entry_id cdef int64_t entry_id
cdef hash_t entity_hash cdef hash_t entity_hash
cdef float prob cdef float prob
cdef _EntryC entry cdef EntryC entry
cdef int32_t dummy_value = 342 cdef int32_t dummy_value = 342
cdef Reader reader = Reader(loc) cdef Reader reader = Reader(loc)
result = reader.read(self.mem, &entry_id, &entity_hash, &prob) # -1: error, 0: eof after this one to_read = self.get_size_entities()
while result:
print("loading") self._entry_index = PreshMap(nr_entities+1)
print("entryID", entry_id) self._entries = entry_vec(nr_entities+1)
print("hash", entity_hash)
print("prob", prob) # we assume the data was written in sequence
print("result:", result) # index 0 is a dummy object not stored in the _entry_index and can be ignored.
print("") # TODO: should we initialize the dummy objects ?
cdef int i = 1
while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities:
assert i == entry_id
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.prob = prob entry.prob = prob
@ -193,9 +208,18 @@ cdef class KnowledgeBase:
entry.vector_rows = &dummy_value entry.vector_rows = &dummy_value
entry.feats_row = dummy_value entry.feats_row = dummy_value
# TODO: use set instead of push_back to ensure the index remains the same? print("bulk loading")
self._entries.push_back(entry) print("i", i)
result = reader.read(self.mem, &entry_id, &entity_hash, &prob) print("entryID", entry_id)
print("hash", entry.entity_hash)
print("prob", entry.prob)
print("")
self._entries[i] = entry
self._entry_index[entity_hash] = i
i += 1
cdef class Writer: cdef class Writer:
def __init__(self, object loc): def __init__(self, object loc):
@ -236,11 +260,6 @@ cdef class Reader:
fclose(self._fp) fclose(self._fp)
cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
"""
Return values:
-1: error during current read (EOF during call)
0: means we read the last line successfully (EOF after call)
1: we can continue reading this file """
status = fread(entry_id, sizeof(int64_t), 1, self._fp) status = fread(entry_id, sizeof(int64_t), 1, self._fp)
if status < 1: if status < 1:
if feof(self._fp): if feof(self._fp):
@ -263,3 +282,5 @@ cdef class Reader:
return 0 return 0
else: else:
return 1 return 1

View File

@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from .typedefs cimport flags_t, attr_t, hash_t from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t from .parts_of_speech cimport univ_pos_t
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
cdef struct LexemeC: cdef struct LexemeC:
flags_t flags flags_t flags
@ -72,3 +76,36 @@ cdef struct TokenC:
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_kb_id attr_t ent_kb_id
hash_t ent_id hash_t ent_id
# Internal struct, for storage and disambiguation of entities.
# One EntryC holds, for a single entity: its hash, a pointer to its vector
# rows, a feature-row index and a probability.
cdef struct EntryC:
# The hash of this entry's unique ID/name in the KB
hash_t entity_hash
# Allows retrieval of one or more vectors.
# Each element of vector_rows should be an index into a vectors table.
# Every entry should have the same number of vectors, so we can avoid storing
# the number of vectors in each knowledge-base struct
int32_t* vector_rows
# Allows retrieval of a struct of non-vector features. We could make this a
# pointer, but we have 32 bits left over in the struct after prob, so we'd
# like this to only be 32 bits. We can also set this to -1, for the common
# case where there are no features.
int32_t feats_row
# Documented as the log probability of the entity, based on corpus frequency.
# NOTE(review): callers have been seen passing raw values such as prob=0.33 to
# add_entity - confirm whether a log value is really what gets stored here.
float prob
# Each alias struct stores the indices of its candidate entries together with
# their prior probabilities for this specific mention/alias.
# (Indices, not pointers: the field below is a vector of int64_t.)
cdef struct AliasC:
# All entry candidates for this alias, stored as integer indices
# (presumably positions in the knowledge base's entries vector - TODO confirm).
vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1.
# Presumably parallel to entry_indices (one value per candidate) - TODO confirm.
vector[float] probs