name per entity

This commit is contained in:
svlandeg 2019-03-21 18:20:57 +01:00
parent d0c763ba44
commit 24a0c4a8d4
3 changed files with 54 additions and 35 deletions

View File

@ -13,8 +13,9 @@ from .typedefs cimport hash_t
# of bits we need to keep track of the answers. # of bits we need to keep track of the answers.
cdef struct _EntryC: cdef struct _EntryC:
# The hash of this entry's unique ID # The hash of this entry's unique ID and name in the kB
hash_t entity_hash hash_t entity_id_hash
hash_t entity_name_hash
# Allows retrieval of one or more vectors. # Allows retrieval of one or more vectors.
# Each element of vector_rows should be an index into a vectors table. # Each element of vector_rows should be an index into a vectors table.
@ -47,7 +48,7 @@ cdef struct _AliasC:
cdef class Entity: cdef class Entity:
cdef readonly KnowledgeBase kb cdef readonly KnowledgeBase kb
cdef hash_t entity_hash cdef hash_t entity_id_hash
cdef float confidence cdef float confidence
@ -55,7 +56,7 @@ cdef class Entity:
cdef class Candidate: cdef class Candidate:
cdef readonly KnowledgeBase kb cdef readonly KnowledgeBase kb
cdef hash_t entity_hash cdef hash_t entity_id_hash
cdef hash_t alias_hash cdef hash_t alias_hash
cdef float prior_prob cdef float prior_prob
@ -104,20 +105,21 @@ cdef class KnowledgeBase:
# optional data, we can let users configure a DB as the backend for this. # optional data, we can let users configure a DB as the backend for this.
cdef object _features_table cdef object _features_table
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob,
int feats_row): int32_t* vector_rows, int feats_row):
"""Add an entry to the knowledge base.""" """Add an entry to the knowledge base."""
# This is what we'll map the hash key to. It's where the entry will sit # This is what we'll map the hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later. # in the vector of entries, so we can get it later.
cdef int64_t new_index = self._entries.size() cdef int64_t new_index = self._entries.size()
self._entries.push_back( self._entries.push_back(
_EntryC( _EntryC(
entity_hash=entity_hash, entity_id_hash=entity_id_hash,
entity_name_hash=entity_name_hash,
vector_rows=vector_rows, vector_rows=vector_rows,
feats_row=feats_row, feats_row=feats_row,
prob=prob prob=prob
)) ))
self._entry_index[entity_hash] = new_index self._entry_index[entity_id_hash] = new_index
return new_index return new_index
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
@ -142,7 +144,8 @@ cdef class KnowledgeBase:
self.strings.add("") self.strings.add("")
self._entries.push_back( self._entries.push_back(
_EntryC( _EntryC(
entity_hash=self.strings.add(""), entity_id_hash=self.strings[""],
entity_name_hash=self.strings[""],
vector_rows=&dummy_value, vector_rows=&dummy_value,
feats_row=dummy_value, feats_row=dummy_value,
prob=dummy_value prob=dummy_value

View File

@ -5,20 +5,20 @@ from spacy.errors import user_warning
cdef class Entity: cdef class Entity:
def __init__(self, KnowledgeBase kb, entity_hash, confidence): def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
self.kb = kb self.kb = kb
self.entity_hash = entity_hash self.entity_id_hash = entity_id_hash
self.confidence = confidence self.confidence = confidence
property kb_id_: property kb_id_:
"""RETURNS (unicode): ID of this entity in the KB""" """RETURNS (unicode): ID of this entity in the KB"""
def __get__(self): def __get__(self):
return self.kb.strings[self.entity_hash] return self.kb.strings[self.entity_id_hash]
property kb_id: property kb_id:
"""RETURNS (uint64): hash of the entity's KB ID""" """RETURNS (uint64): hash of the entity's KB ID"""
def __get__(self): def __get__(self):
return self.entity_hash return self.entity_id_hash
property confidence: property confidence:
def __get__(self): def __get__(self):
@ -27,32 +27,43 @@ cdef class Entity:
cdef class Candidate: cdef class Candidate:
def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
self.kb = kb self.kb = kb
self.entity_hash = entity_hash self.entity_id_hash = entity_id_hash
self.alias_hash = alias_hash self.alias_hash = alias_hash
self.prior_prob = prior_prob self.prior_prob = prior_prob
property kb_id_: property entity_id:
"""RETURNS (unicode): ID of this entity in the KB"""
def __get__(self):
return self.kb.strings[self.entity_hash]
property kb_id:
"""RETURNS (uint64): hash of the entity's KB ID""" """RETURNS (uint64): hash of the entity's KB ID"""
def __get__(self): def __get__(self):
return self.entity_hash return self.entity_id_hash
property alias_: property entity_id_:
"""RETURNS (unicode): ID of the original alias""" """RETURNS (unicode): ID of this entity in the KB"""
def __get__(self): def __get__(self):
return self.kb.strings[self.alias_hash] return self.kb.strings[self.entity_id]
property entity_name:
"""RETURNS (uint64): hash of the entity's KB name"""
def __get__(self):
entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
return self.kb._entries[entry_index].entity_name_hash
property entity_name_:
"""RETURNS (unicode): name of this entity in the KB"""
def __get__(self):
return self.kb.strings[self.entity_name]
property alias: property alias:
"""RETURNS (uint64): hash of the alias""" """RETURNS (uint64): hash of the alias"""
def __get__(self): def __get__(self):
return self.alias_hash return self.alias_hash
property alias_:
"""RETURNS (unicode): ID of the original alias"""
def __get__(self):
return self.kb.strings[self.alias]
property prior_prob: property prior_prob:
def __get__(self): def __get__(self):
return self.prior_prob return self.prior_prob
@ -76,12 +87,15 @@ cdef class KnowledgeBase:
def get_size_aliases(self): def get_size_aliases(self):
return self._aliases_table.size() - 1 # not counting dummy element on index 0 return self._aliases_table.size() - 1 # not counting dummy element on index 0
def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None):
""" """
Add an entity to the KB. Add an entity to the KB.
Return the hash of the entity ID at the end Return the hash of the entity ID at the end
""" """
if not entity_name:
entity_name = entity_id
cdef hash_t id_hash = self.strings.add(entity_id) cdef hash_t id_hash = self.strings.add(entity_id)
cdef hash_t name_hash = self.strings.add(entity_name)
# Return if this entity was added before # Return if this entity was added before
if id_hash in self._entry_index: if id_hash in self._entry_index:
@ -89,7 +103,7 @@ cdef class KnowledgeBase:
return return
cdef int32_t dummy_value = 342 cdef int32_t dummy_value = 342
self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
# TODO self._vectors_table.get_pointer(vectors), # TODO self._vectors_table.get_pointer(vectors),
# self._features_table.get(features)) # self._features_table.get(features))
@ -127,11 +141,11 @@ cdef class KnowledgeBase:
cdef vector[float] probs cdef vector[float] probs
for entity, prob in zip(entities, probabilities): for entity, prob in zip(entities, probabilities):
entity_hash = self.strings[entity] entity_id_hash = self.strings[entity]
if not entity_hash in self._entry_index: if not entity_id_hash in self._entry_index:
raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'") raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
entry_index = <int64_t>self._entry_index.get(entity_hash) entry_index = <int64_t>self._entry_index.get(entity_id_hash)
entry_indices.push_back(int(entry_index)) entry_indices.push_back(int(entry_index))
probs.push_back(float(prob)) probs.push_back(float(prob))
@ -146,7 +160,7 @@ cdef class KnowledgeBase:
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self, return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash, entity_id_hash=self._entries[entry_index].entity_id_hash,
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prob) prior_prob=prob)
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)

View File

@ -12,15 +12,15 @@ def create_kb():
# adding entities # adding entities
entity_0 = "Q0" # douglas adams entity_0 = "Q0" # douglas adams
print(" adding entity", entity_0) print(" adding entity", entity_0)
mykb.add_entity(entity_id=entity_0, prob=0.5) mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
entity_42 = "Q42" # douglas adams entity_42 = "Q42" # douglas adams
print(" adding entity", entity_42) print(" adding entity", entity_42)
mykb.add_entity(entity_id=entity_42, prob=0.5) mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
entity_5301561 = "Q5301561" entity_5301561 = "Q5301561"
print(" adding entity", entity_5301561) print(" adding entity", entity_5301561)
mykb.add_entity(entity_id=entity_5301561, prob=0.5) mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
print() print()
@ -63,7 +63,9 @@ def add_el(kb):
print() print()
for alias in ["douglassss", "rubbish", "adam"]: for alias in ["douglassss", "rubbish", "adam"]:
candidates = nlp.linker.kb.get_candidates(alias) candidates = nlp.linker.kb.get_candidates(alias)
print(len(candidates), "candidates for", alias) print(len(candidates), "candidates for", alias, ":")
for c in candidates:
print(" ", c.entity_id_, c.entity_name_, c.alias_)
if __name__ == "__main__": if __name__ == "__main__":