diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 5fd239998..cffbcd5d1 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -13,8 +13,9 @@ from .typedefs cimport hash_t # of bits we need to keep track of the answers. cdef struct _EntryC: - # The hash of this entry's unique ID - hash_t entity_hash + # The hash of this entry's unique ID and name in the KB + hash_t entity_id_hash + hash_t entity_name_hash # Allows retrieval of one or more vectors. # Each element of vector_rows should be an index into a vectors table. @@ -47,7 +48,7 @@ cdef struct _AliasC: cdef class Entity: cdef readonly KnowledgeBase kb - cdef hash_t entity_hash + cdef hash_t entity_id_hash cdef float confidence @@ -55,7 +56,7 @@ cdef class Entity: cdef class Candidate: cdef readonly KnowledgeBase kb - cdef hash_t entity_hash + cdef hash_t entity_id_hash cdef hash_t alias_hash cdef float prior_prob @@ -104,20 +105,21 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table - cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, - int feats_row): + cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob, + int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. 
cdef int64_t new_index = self._entries.size() self._entries.push_back( _EntryC( - entity_hash=entity_hash, + entity_id_hash=entity_id_hash, + entity_name_hash=entity_name_hash, vector_rows=vector_rows, feats_row=feats_row, prob=prob )) - self._entry_index[entity_hash] = new_index + self._entry_index[entity_id_hash] = new_index return new_index cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): @@ -142,7 +144,8 @@ cdef class KnowledgeBase: self.strings.add("") self._entries.push_back( _EntryC( - entity_hash=self.strings.add(""), + entity_id_hash=self.strings[""], + entity_name_hash=self.strings[""], vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 33a79da04..e51cb087d 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,20 +5,20 @@ from spacy.errors import user_warning cdef class Entity: - def __init__(self, KnowledgeBase kb, entity_hash, confidence): + def __init__(self, KnowledgeBase kb, entity_id_hash, confidence): self.kb = kb - self.entity_hash = entity_hash + self.entity_id_hash = entity_id_hash self.confidence = confidence property kb_id_: """RETURNS (unicode): ID of this entity in the KB""" def __get__(self): - return self.kb.strings[self.entity_hash] + return self.kb.strings[self.entity_id_hash] property kb_id: """RETURNS (uint64): hash of the entity's KB ID""" def __get__(self): - return self.entity_hash + return self.entity_id_hash property confidence: def __get__(self): @@ -27,32 +27,43 @@ cdef class Entity: cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): + def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob): self.kb = kb - self.entity_hash = entity_hash + self.entity_id_hash = entity_id_hash self.alias_hash = alias_hash self.prior_prob = prior_prob - property kb_id_: - """RETURNS (unicode): ID of this entity in the KB""" - def __get__(self): - return 
self.kb.strings[self.entity_hash] - - property kb_id: + property entity_id: """RETURNS (uint64): hash of the entity's KB ID""" def __get__(self): - return self.entity_hash + return self.entity_id_hash - property alias_: - """RETURNS (unicode): ID of the original alias""" + property entity_id_: + """RETURNS (unicode): ID of this entity in the KB""" def __get__(self): - return self.kb.strings[self.alias_hash] + return self.kb.strings[self.entity_id] + + property entity_name: + """RETURNS (uint64): hash of the entity's KB name""" + def __get__(self): + entry_index = self.kb._entry_index.get(self.entity_id) + return self.kb._entries[entry_index].entity_name_hash + + property entity_name_: + """RETURNS (unicode): name of this entity in the KB""" + def __get__(self): + return self.kb.strings[self.entity_name] property alias: """RETURNS (uint64): hash of the alias""" def __get__(self): return self.alias_hash + property alias_: + """RETURNS (unicode): text of the original alias""" + def __get__(self): + return self.kb.strings[self.alias] + property prior_prob: def __get__(self): return self.prior_prob @@ -76,12 +87,15 @@ cdef class KnowledgeBase: def get_size_aliases(self): return self._aliases_table.size() - 1 # not counting dummy element on index 0 - def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): + def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None): """ Add an entity to the KB. 
Return the hash of the entity ID at the end """ + if not entity_name: + entity_name = entity_id cdef hash_t id_hash = self.strings.add(entity_id) + cdef hash_t name_hash = self.strings.add(entity_name) # Return if this entity was added before if id_hash in self._entry_index: @@ -89,7 +103,7 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) + self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -127,11 +141,11 @@ cdef class KnowledgeBase: cdef vector[float] probs for entity, prob in zip(entities, probabilities): - entity_hash = self.strings[entity] - if not entity_hash in self._entry_index: + entity_id_hash = self.strings[entity] + if not entity_id_hash in self._entry_index: raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'") - entry_index = self._entry_index.get(entity_hash) + entry_index = self._entry_index.get(entity_id_hash) entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) @@ -146,7 +160,7 @@ cdef class KnowledgeBase: alias_entry = self._aliases_table[alias_index] return [Candidate(kb=self, - entity_hash=self._entries[entry_index].entity_hash, + entity_id_hash=self._entries[entry_index].entity_id_hash, alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index f6296bf89..c7b0a3a07 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -12,15 +12,15 @@ def create_kb(): # adding entities entity_0 = "Q0" # douglas adams print(" adding entity", entity_0) - mykb.add_entity(entity_id=entity_0, prob=0.5) + 
mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5) entity_42 = "Q42" # douglas adams print(" adding entity", entity_42) - mykb.add_entity(entity_id=entity_42, prob=0.5) + mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5) entity_5301561 = "Q5301561" print(" adding entity", entity_5301561) - mykb.add_entity(entity_id=entity_5301561, prob=0.5) + mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print() @@ -63,7 +63,9 @@ def add_el(kb): print() for alias in ["douglassss", "rubbish", "adam"]: candidates = nlp.linker.kb.get_candidates(alias) - print(len(candidates), "candidates for", alias) + print(len(candidates), "candidates for", alias, ":") + for c in candidates: + print(" ", c.entity_id_, c.entity_name_, c.alias_) if __name__ == "__main__":