From 9a46c431c3d806c21d248fd3ab549bf99ce5e5d2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 12:31:02 +0100 Subject: [PATCH] store entity hash instead of pointer --- spacy/kb.pxd | 21 +++++------ spacy/kb.pyx | 51 +++++++++----------------- spacy/sandbox_test_sofie/testing_el.py | 24 ++++++++++-- 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index c409cf1b4..c0998eadb 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -14,7 +14,7 @@ from .typedefs cimport hash_t cdef struct _EntryC: # The hash of this entry's unique ID - hash_t entity_key + hash_t entity_hash # Allows retrieval of one or more vectors. # Each element of vector_rows should be an index into a vectors table. @@ -46,13 +46,10 @@ cdef struct _AliasC: # TODO: document cdef class Candidate: - cdef _EntryC* entity + cdef hash_t entity_hash cdef hash_t alias_hash cdef float prior_prob - @staticmethod - cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob) - cdef class KnowledgeBase: cdef Pool mem @@ -98,8 +95,7 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table - - cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows, + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit @@ -107,15 +103,15 @@ cdef class KnowledgeBase: cdef int64_t entity_index = self._entries.size() self._entries.push_back( _EntryC( - entity_key=entity_key, + entity_hash=entity_hash, vector_rows=vector_rows, feats_row=feats_row, prob=prob )) - self._entry_index[entity_key] = entity_index + self._entry_index[entity_hash] = entity_index return entity_index - cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): + cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" cdef int64_t alias_index = self._aliases_table.size() @@ -124,7 +120,7 @@ cdef class KnowledgeBase: entry_indices=entry_indices, probs=probs )) - self._alias_index[alias_key] = alias_index + self._alias_index[alias_hash] = alias_index return alias_index cdef inline create_empty_vectors(self): @@ -134,9 +130,10 @@ cdef class KnowledgeBase: cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 + self.strings.add("") self._entries.push_back( _EntryC( - entity_key=self.strings.add(""), + entity_hash=self.strings.add(""), vector_rows=&dummy_value, feats_row=dummy_value, prob=dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 38bc48c7f..cca24d4f8 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -5,30 +5,20 @@ from spacy.errors import user_warning cdef class Candidate: + def __init__(self, entity_hash, alias_hash, prior_prob): + self.entity_hash = entity_hash + self.alias_hash = alias_hash + self.prior_prob = prior_prob - # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob): - # self.alias_hash = alias_hash - # self.entity = entity - # self.prior_prob = prior_prob + def get_entity_name(self, KnowledgeBase kb): + return kb.strings[self.entity_hash] - @staticmethod - cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob): - """Factory function to create Candidate objects from entity entries.""" - # Call to __new__ bypasses __init__ constructor - cdef Candidate candidate = Candidate.__new__(Candidate) - candidate.entity = entity - candidate.alias_hash = alias_hash - candidate.prior_prob = prior_prob - return candidate + def get_alias_name(self, KnowledgeBase kb): + return kb.strings[self.alias_hash] - def __str__(self): - return "alias=" + self.strings[self.alias_hash] + \ - " prior_prob=" + str(self.prior_prob) - - #" entry=" + self.strings[self.entity_hash] + \ - - def __repr__(self): - return self.__str__() + property prior_prob: + def __get__(self): + return self.prior_prob cdef class KnowledgeBase: @@ -58,7 +48,7 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) + self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -99,7 +89,7 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) + self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) def get_candidates(self, unicode alias): @@ -107,15 +97,8 @@ cdef class KnowledgeBase: alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs): - entity = <_EntryC>self._entries[entry_index] - # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob) - candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob) - print(candidate) - - # return [Candidate(entity=<_EntryC>self._entries[self._entry_index[entry_index]], - # alias_hash=alias_hash, - # prior_prob=prob) - # for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] - + return [Candidate(entity_hash=self._entries[entry_index].entity_hash, + alias_hash=alias_hash, + prior_prob=prob) + for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index c96c5552f..5c0d6a037 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -39,12 +39,28 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print("candidates for", alias1) - candidates = mykb.get_candidates(alias1) - print(" ", candidates) + candidates1 = mykb.get_candidates(alias1) + for candidate in candidates1: + print(" candidate") + print(" name", candidate.get_entity_name(mykb)) + print(" alias", candidate.get_alias_name(mykb)) + print(" prior_prob", candidate.prior_prob) + + print("candidates for", alias2) + candidates2 = mykb.get_candidates(alias2) + for candidate in candidates2: + print(" candidate") + print(" name", candidate.get_entity_name(mykb)) + print(" alias", candidate.get_alias_name(mykb)) + print(" prior_prob", candidate.prior_prob) print("candidates for", alias3) - candidates = mykb.get_candidates(alias3) - print(" ", candidates) + candidates3 = mykb.get_candidates(alias3) + for candidate in candidates3: + print(" candidate") + print(" name", candidate.get_entity_name(mykb)) + print(" alias", candidate.get_alias_name(mykb)) + print(" prior_prob", candidate.prior_prob) def add_el():