adding aliases per entity in the KB

This commit is contained in:
svlandeg 2019-03-18 12:38:40 +01:00
parent 3945fd21b0
commit 5ac7edf53c
2 changed files with 54 additions and 24 deletions

View File

@ -27,15 +27,25 @@ cdef struct _EntryC:
float prob float prob
# Each alias struct stores the entry candidates for one specific mention/alias,
# as indices (not pointers), together with their prior probabilities.
# NOTE(review): presumably probs[i] belongs to entry_indices[i] - confirm against callers.
cdef struct _AliasC:
# All entry candidates for this alias
const vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1.
const vector[float] probs
cdef class KnowledgeBase: cdef class KnowledgeBase:
cdef Pool mem cdef Pool mem
# This maps 64bit keys to 64bit values. Here the key would be a hash of # This maps 64bit keys (hash of unique entity string)
# a unique string name for the entity, and the value would be the position # to 64bit values (position of the _EntryC struct in the _entries vector).
# of the _EntryC struct in our vector.
# The PreshMap is pretty space efficient, as it uses open addressing. So # The PreshMap is pretty space efficient, as it uses open addressing. So
# the only overhead is the vacancy rate, which is approximately 30%. # the only overhead is the vacancy rate, which is approximately 30%.
cdef PreshMap _index cdef PreshMap _entry_index
# Each entry takes 128 bits, and again we'll have a 30% or so overhead for # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
# over allocation. # over allocation.
@ -43,6 +53,16 @@ cdef class KnowledgeBase:
# Storing 1m entries would take 41.6mb under this scheme. # Storing 1m entries would take 41.6mb under this scheme.
cdef vector[_EntryC] _entries cdef vector[_EntryC] _entries
# This maps 64bit keys (hash of unique alias string)
# to 64bit values (position of the _AliasC struct in the _aliases_table vector).
cdef PreshMap _alias_index
# Each _AliasC in this vector holds the entry candidates for one mention/alias
# together with their prior probabilities. The probability should be
# P(entity | mention), which is pretty important to know.
# The position of an alias in this vector is looked up through _alias_index.
cdef vector[_AliasC] _aliases_table
# This is the part which might take more space: storing various # This is the part which might take more space: storing various
# categorical features for the entries, and storing vectors for disambiguation # categorical features for the entries, and storing vectors for disambiguation
# and possibly usage. # and possibly usage.
@ -61,23 +81,30 @@ cdef class KnowledgeBase:
# optional data, we can let users configure a DB as the backend for this. # optional data, we can let users configure a DB as the backend for this.
cdef object _features_table cdef object _features_table
# This should map mention hashes to (entry_id, prob) tuples. The probability
# should be P(entity | mention), which is pretty important to know.
# We can pack both pieces of information into a 64-bit value, to keep things
# efficient.
cdef object _aliases_table
cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
int feats_row): int feats_row):
"""Add an entry to the knowledge base.""" """Add an entry to the knowledge base."""
# This is what we'll map the hash key to. It's where the entry will sit # This is what we'll map the hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later. # in the vector of entries, so we can get it later.
cdef int64_t index = self._entries.size() cdef int64_t entity_index = self._entries.size()
self._entries.push_back( self._entries.push_back(
_EntryC( _EntryC(
vector_rows=vector_rows, vector_rows=vector_rows,
feats_row=feats_row, feats_row=feats_row,
prob=prob prob=prob
)) ))
self._index[key] = index self._index[entity_key] = entity_index
return index return entity_index
cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
"""Connect a mention to a list of potential entities with their prior probabilities.

alias_key: hash key under which the alias is registered in _alias_index.
entry_indices: entry candidates stored in the new _AliasC.
probs: prior probabilities stored in the new _AliasC.
RETURNS: the position of the new _AliasC in _aliases_table.
"""
# The new struct is appended, so its position is the current size of the table.
# NOTE(review): nothing here checks that entry_indices and probs have the same length.
cdef int64_t alias_index = self._aliases_table.size()
self._aliases_table.push_back(
_AliasC(
entry_indices=entry_indices,
probs=probs
))
# Remember where the alias sits so it can be found again by its key.
self._alias_index[alias_key] = alias_index
return alias_index

View File

@ -5,14 +5,14 @@ cdef class KnowledgeBase:
def __len__(self): def __len__(self):
return self._entries.size() return self._entries.size()
def add_entity(self, name, float prob, vectors=None, features=None, aliases=None): def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
# TODO: more friendly check for non-unique name # TODO: more friendly check for non-unique name
if name in self: if entity_id in self:
return return
cdef hash_t name_hash = hash_string(name) cdef hash_t id_hash = hash_string(entity_id)
cdef int32_t dummy_value = 342 cdef int32_t dummy_value = 342
self.c_add_entity(name_hash, prob, &dummy_value, dummy_value) self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
# TODO self._vectors_table.get_pointer(vectors), # TODO self._vectors_table.get_pointer(vectors),
# self._features_table.get(features)) # self._features_table.get(features))
@ -22,10 +22,13 @@ cdef class KnowledgeBase:
cdef hash_t entity_hash = 0 cdef hash_t entity_hash = 0
cdef int64_t entity_index = 0 cdef int64_t entity_index = 0
# TODO: check len(entities) == len(probabilities) cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
for entity, prob in zip(entities, probabilities):
entity_hash = hash_string(entity) self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
entity_index = self._index[entity_hash]
# TODO: check that entity is already in this KB (entity_index is OK) # TODO: check that alias hadn't been defined before
self._aliases_table.add(alias_hash, entity_index, prob) # TODO: check that entity is already in this KB (entity_index is OK)
# TODO: check sum(probabilities) <= 1
# TODO: check len(entities) == len(probabilities)