Rename entity `prob` (log probability) parameter/field to `freq` (entity frequency)

Commit authored by svlandeg on 2019-07-19 17:40:28 +02:00
parent f75d1299a7
commit dae8a21282
7 changed files with 49 additions and 49 deletions

View File

@ -70,7 +70,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
print()
print(" * adding", len(entity_list), "entities", datetime.datetime.now())
kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings)
kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)
print()
print(" * adding aliases", datetime.datetime.now())

View File

@ -14,15 +14,15 @@ def create_kb(vocab):
# adding entities
entity_0 = "Q1004791_Douglas"
print("adding entity", entity_0)
kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0])
kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0])
entity_1 = "Q42_Douglas_Adams"
print("adding entity", entity_1)
kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1])
kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1])
entity_2 = "Q5301561_Douglas_Haig"
print("adding entity", entity_2)
kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2])
kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2])
# adding aliases
print()

View File

@ -79,7 +79,7 @@ cdef class KnowledgeBase:
return new_index
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
int32_t vector_index, int feats_row) nogil:
"""Add an entry to the vector of entries.
After calling this method, make sure to update also the _entry_index using the return value"""
@ -92,7 +92,7 @@ cdef class KnowledgeBase:
entry.entity_hash = entity_hash
entry.vector_index = vector_index
entry.feats_row = feats_row
entry.prob = prob
entry.freq = freq
self._entries.push_back(entry)
return new_index
@ -125,7 +125,7 @@ cdef class KnowledgeBase:
entry.entity_hash = dummy_hash
entry.vector_index = dummy_value
entry.feats_row = dummy_value
entry.prob = dummy_value
entry.freq = dummy_value
# Avoid struct initializer to enable nogil
cdef vector[int64_t] dummy_entry_indices
@ -141,7 +141,7 @@ cdef class KnowledgeBase:
self._aliases_table.push_back(alias)
cpdef load_bulk(self, loc)
cpdef set_entities(self, entity_list, prob_list, vector_list)
cpdef set_entities(self, entity_list, freq_list, vector_list)
cdef class Writer:
@ -149,7 +149,7 @@ cdef class Writer:
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
cdef int write_vector_element(self, float element) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
cdef int write_alias_length(self, int64_t alias_length) except -1
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
@ -162,7 +162,7 @@ cdef class Reader:
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
cdef int read_vector_element(self, float* element) except -1
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
cdef int read_alias_length(self, int64_t* alias_length) except -1
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1

View File

@ -94,7 +94,7 @@ cdef class KnowledgeBase:
def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index]
def add_entity(self, unicode entity, float prob, vector[float] entity_vector):
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
"""
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end.
@ -113,15 +113,15 @@ cdef class KnowledgeBase:
vector_index = self.c_add_vector(entity_vector=entity_vector)
new_index = self.c_add_entity(entity_hash=entity_hash,
prob=prob,
freq=freq,
vector_index=vector_index,
feats_row=-1) # Features table currently not implemented
self._entry_index[entity_hash] = new_index
return entity_hash
cpdef set_entities(self, entity_list, prob_list, vector_list):
if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list):
cpdef set_entities(self, entity_list, freq_list, vector_list):
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)
nr_entities = len(entity_list)
@ -137,7 +137,7 @@ cdef class KnowledgeBase:
entity_hash = self.vocab.strings.add(entity_list[i])
entry.entity_hash = entity_hash
entry.prob = prob_list[i]
entry.freq = freq_list[i]
vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index
@ -196,7 +196,7 @@ cdef class KnowledgeBase:
return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].prob,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash,
prior_prob=prior_prob)
@ -252,7 +252,7 @@ cdef class KnowledgeBase:
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index)
writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
i = i+1
writer.write_alias_length(self.get_size_aliases())
@ -278,7 +278,7 @@ cdef class KnowledgeBase:
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
cdef float prob
cdef float freq
cdef int32_t vector_index
cdef KBEntryC entry
cdef AliasC alias
@ -314,10 +314,10 @@ cdef class KnowledgeBase:
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1
while i <= nr_entities:
reader.read_entry(&entity_hash, &prob, &vector_index)
reader.read_entry(&entity_hash, &freq, &vector_index)
entry.entity_hash = entity_hash
entry.prob = prob
entry.freq = freq
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
@ -387,9 +387,9 @@ cdef class Writer:
cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element))
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1:
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_prob, sizeof(entry_prob))
self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index))
# Features table currently not implemented and not written to file
@ -444,18 +444,18 @@ cdef class Reader:
return 0 # end of file
raise IOError("error reading entity vector from input file")
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1:
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
status = self._read(entity_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError("error reading entity hash from input file")
status = self._read(prob, sizeof(float))
status = self._read(freq, sizeof(float))
if status < 1:
if feof(self._fp):
return 0 # end of file
raise IOError("error reading entity prob from input file")
raise IOError("error reading entity freq from input file")
status = self._read(vector_index, sizeof(int32_t))
if status < 1:

View File

@ -93,7 +93,7 @@ cdef struct KBEntryC:
int32_t feats_row
# log probability of entity, based on corpus frequency
float prob
float freq
# Each alias struct stores a list of Entry pointers with their prior probabilities

View File

@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3])
mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5])
# adding aliases
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
@ -50,9 +50,9 @@ def test_kb_invalid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases - should fail because one of the given IDs is not valid
with pytest.raises(ValueError):
@ -66,9 +66,9 @@ def test_kb_invalid_probabilities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases - should fail because the sum of the probabilities exceeds 1
with pytest.raises(ValueError):
@ -80,9 +80,9 @@ def test_kb_invalid_combination(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases - should fail because the entities and probabilities vectors are not of equal length
with pytest.raises(ValueError):
@ -96,11 +96,11 @@ def test_kb_invalid_entity_vector(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3])
# this should fail because the kb's expected entity vector length is 3
with pytest.raises(ValueError):
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
def test_candidate_generation(nlp):
@ -108,9 +108,9 @@ def test_candidate_generation(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
@ -133,8 +133,8 @@ def test_preserving_links_asdoc(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1])
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1])
# adding aliases
mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])

View File

@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab):
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4])
kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4])
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])