adding aliases per entity in the KB

This commit is contained in:
svlandeg 2019-03-18 12:38:40 +01:00
parent 3945fd21b0
commit 5ac7edf53c
2 changed files with 54 additions and 24 deletions

View File

@ -27,15 +27,25 @@ cdef struct _EntryC:
float prob
# Each alias struct stores the candidate entries for one specific mention/alias,
# as positions in the _entries vector, together with their prior probabilities.
cdef struct _AliasC:
# All entry candidates for this alias
const vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1.
# presumably parallel to entry_indices (probs[i] belongs to entry_indices[i]) - TODO confirm
const vector[float] probs
cdef class KnowledgeBase:
cdef Pool mem
# This maps 64bit keys to 64bit values. Here the key would be a hash of
# a unique string name for the entity, and the value would be the position
# of the _EntryC struct in our vector.
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _EntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So
# the only overhead is the vacancy rate, which is approximately 30%.
cdef PreshMap _index
cdef PreshMap _entry_index
# Each entry takes 128 bits, and again we'll have a 30% or so overhead for
# over allocation.
@ -43,6 +53,16 @@ cdef class KnowledgeBase:
# Storing 1m entries would take 41.6mb under this scheme.
cdef vector[_EntryC] _entries
# This maps 64bit keys (hash of unique alias string)
# to 64bit values (position of the _AliasC struct in the _aliases_table vector).
cdef PreshMap _alias_index
# This stores one _AliasC struct per alias/mention, at the position recorded in
# _alias_index. Each struct holds the candidate entries for that mention and their
# prior probabilities P(entity | mention), which is pretty important to know.
cdef vector[_AliasC] _aliases_table
# This is the part which might take more space: storing various
# categorical features for the entries, and storing vectors for disambiguation
# and possibly usage.
@ -61,23 +81,30 @@ cdef class KnowledgeBase:
# optional data, we can let users configure a DB as the backend for this.
cdef object _features_table
# This should map mention hashes to (entry_id, prob) tuples. The probability
# should be P(entity | mention), which is pretty important to know.
# We can pack both pieces of information into a 64-bit value, to keep things
# efficient.
cdef object _aliases_table
cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
                                 int feats_row):
    """Add an entry to the knowledge base.

    entity_key: hash of the unique entity string.
    prob: value stored in the entry's `prob` field.
    vector_rows: pointer stored on the entry as-is (it is not copied), so the
        memory it points to has to outlive the entry.
    feats_row: value stored in the entry's `feats_row` field.
    RETURNS: position of the new _EntryC struct in the _entries vector.
    """
    # The new entry is appended, so its position equals the current size of
    # the vector. That position is what the hash key gets mapped to, so the
    # entry can be looked up again later.
    cdef int64_t entity_index = self._entries.size()
    self._entries.push_back(
        _EntryC(
            vector_rows=vector_rows,
            feats_row=feats_row,
            prob=prob
        ))
    # The entity map is declared as `_entry_index`; the old name `_index`
    # no longer exists on the class.
    self._entry_index[entity_key] = entity_index
    return entity_index
cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
"""Connect a mention to a list of potential entities with their prior probabilities.

alias_key is mapped to the position of the new _AliasC struct in the
_aliases_table vector, and that position is returned.
"""
# The new struct is appended, so its position equals the current size of the vector.
cdef int64_t alias_index = self._aliases_table.size()
self._aliases_table.push_back(
_AliasC(
entry_indices=entry_indices,
probs=probs
))
# NOTE(review): nothing here checks that entry_indices and probs have the same
# length, or whether alias_key was registered before - presumably an existing
# mapping is overwritten; confirm against the PreshMap semantics.
self._alias_index[alias_key] = alias_index
return alias_index

View File

@ -5,14 +5,14 @@ cdef class KnowledgeBase:
def __len__(self):
"""Return the number of entries in the _entries vector."""
return self._entries.size()
def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
# Hashes the entity's string id and appends a new entry through c_add_entity.
# `vectors` and `features` are not read in the code visible here (see the TODO
# at the end).
# TODO: more friendly check for non-unique name
# When the membership test below is true, the call returns without adding anything
# and without telling the caller.
if name in self:
if entity_id in self:
return
cdef hash_t name_hash = hash_string(name)
cdef hash_t id_hash = hash_string(entity_id)
# NOTE(review): dummy_value is a local, and its address is passed as vector_rows,
# which c_add_entity stores on the entry without copying. The stored pointer looks
# like it will dangle once this method returns - placeholder until the vectors
# table is wired in; confirm before anything dereferences vector_rows.
cdef int32_t dummy_value = 342
self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
# TODO self._vectors_table.get_pointer(vectors),
# self._features_table.get(features))
@ -22,10 +22,13 @@ cdef class KnowledgeBase:
cdef hash_t entity_hash = 0
cdef int64_t entity_index = 0
# TODO: check len(entities) == len(probabilities)
for entity, prob in zip(entities, probabilities):
entity_hash = hash_string(entity)
entity_index = self._index[entity_hash]
# TODO: check that entity is already in this KB (entity_index is OK)
self._aliases_table.add(alias_hash, entity_index, prob)
cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
# TODO: check that alias hadn't been defined before
# TODO: check that entity is already in this KB (entity_index is OK)
# TODO: check sum(probabilities) <= 1
# TODO: check len(entities) == len(probabilities)