From 5ac7edf53c328c90ac4701ef687b0964ea4b756c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 18 Mar 2019 12:38:40 +0100
Subject: [PATCH] adding aliases per entity in the KB

---
 spacy/kb.pxd | 53 +++++++++++++++++++++++++++++++++++++++-------------
 spacy/kb.pyx | 25 ++++++++++++++-----------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 3ba9c8bba..92a0c8b95 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -27,15 +27,25 @@ cdef struct _EntryC:
     float prob
 
 
+# Each alias struct stores a list of Entry pointers with their prior probabilities
+# for this specific mention/alias.
+cdef struct _AliasC:
+
+    # All entry candidates for this alias
+    const vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    const vector[float] probs
+
+
 cdef class KnowledgeBase:
     cdef Pool mem
 
-    # This maps 64bit keys to 64bit values. Here the key would be a hash of
-    # a unique string name for the entity, and the value would be the position
-    # of the _EntryC struct in our vector.
+    # This maps 64bit keys (hash of unique entity string)
+    # to 64bit values (position of the _EntryC struct in the _entries vector).
     # The PreshMap is pretty space efficient, as it uses open addressing. So
     # the only overhead is the vacancy rate, which is approximately 30%.
-    cdef PreshMap _index
+    cdef PreshMap _entry_index
 
     # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
     # over allocation.
@@ -43,6 +53,16 @@ cdef class KnowledgeBase:
     # Storing 1m entries would take 41.6mb under this scheme.
     cdef vector[_EntryC] _entries
 
+    # This maps 64bit keys (hash of unique alias string)
+    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
+    cdef PreshMap _alias_index
+
+    # This stores one _AliasC struct per alias (mention). Each struct holds the
+    # indices of the candidate entries in _entries, together with the prior
+    # probabilities P(entity | mention), which are pretty important to know.
+    # The position of each struct is looked up through _alias_index.
+    cdef vector[_AliasC] _aliases_table
+
     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
     # and possibly usage.
@@ -61,23 +81,30 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    # This should map mention hashes to (entry_id, prob) tuples. The probability
-    # should be P(entity | mention), which is pretty important to know.
-    # We can pack both pieces of information into a 64-bit value, to keep things
-    # efficient.
-    cdef object _aliases_table
 
-    cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
                                      int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t index = self._entries.size()
+        cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[key] = index
-        return index
\ No newline at end of file
+        self._entry_index[entity_key] = entity_index
+        return entity_index
+
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+        """Connect a mention to a list of potential entities with their prior probabilities."""
+        cdef int64_t alias_index = self._aliases_table.size()
+
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=entry_indices,
+                probs=probs
+            ))
+        self._alias_index[alias_key] = alias_index
+        return alias_index
\ No newline at end of file
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 46acc2967..0f6a7aecc 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,16 +5,16 @@ cdef class KnowledgeBase:
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
+    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
         # TODO: more friendly check for non-unique name
-        if name in self:
+        if entity_id in self:
             return
 
-        cdef hash_t name_hash = hash_string(name)
+        cdef hash_t id_hash = hash_string(entity_id)
         cdef int32_t dummy_value = 342
-        self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
+        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
-        #                  self._features_table.get(features))
+        #      self._features_table.get(features))
 
     def add_alias(self, alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
@@ -22,10 +22,13 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash = 0
         cdef int64_t entity_index = 0
 
-        # TODO: check len(entities) == len(probabilities)
-        for entity, prob in zip(entities, probabilities):
-            entity_hash = hash_string(entity)
-            entity_index = self._index[entity_hash]
-            # TODO: check that entity is already in this KB (entity_index is OK)
-            self._aliases_table.add(alias_hash, entity_index, prob)
+        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
+
+        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
+
+        # TODO: check that alias hadn't been defined before
+        # TODO: check that entity is already in this KB (entity_index is OK)
+        # TODO: check sum(probabilities) <= 1
+        # TODO: check len(entities) == len(probabilities)
+
 