rename entity frequency

This commit is contained in:
svlandeg 2019-07-19 17:40:28 +02:00
parent f75d1299a7
commit dae8a21282
7 changed files with 49 additions and 49 deletions

View File

@ -70,7 +70,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
print() print()
print(" * adding", len(entity_list), "entities", datetime.datetime.now()) print(" * adding", len(entity_list), "entities", datetime.datetime.now())
kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings) kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)
print() print()
print(" * adding aliases", datetime.datetime.now()) print(" * adding aliases", datetime.datetime.now())

View File

@ -14,15 +14,15 @@ def create_kb(vocab):
# adding entities # adding entities
entity_0 = "Q1004791_Douglas" entity_0 = "Q1004791_Douglas"
print("adding entity", entity_0) print("adding entity", entity_0)
kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0]) kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0])
entity_1 = "Q42_Douglas_Adams" entity_1 = "Q42_Douglas_Adams"
print("adding entity", entity_1) print("adding entity", entity_1)
kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1]) kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1])
entity_2 = "Q5301561_Douglas_Haig" entity_2 = "Q5301561_Douglas_Haig"
print("adding entity", entity_2) print("adding entity", entity_2)
kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2]) kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2])
# adding aliases # adding aliases
print() print()

View File

@ -79,7 +79,7 @@ cdef class KnowledgeBase:
return new_index return new_index
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
int32_t vector_index, int feats_row) nogil: int32_t vector_index, int feats_row) nogil:
"""Add an entry to the vector of entries. """Add an entry to the vector of entries.
After calling this method, make sure to also update the _entry_index using the return value""" After calling this method, make sure to also update the _entry_index using the return value"""
@ -92,7 +92,7 @@ cdef class KnowledgeBase:
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.vector_index = vector_index entry.vector_index = vector_index
entry.feats_row = feats_row entry.feats_row = feats_row
entry.prob = prob entry.freq = freq
self._entries.push_back(entry) self._entries.push_back(entry)
return new_index return new_index
@ -125,7 +125,7 @@ cdef class KnowledgeBase:
entry.entity_hash = dummy_hash entry.entity_hash = dummy_hash
entry.vector_index = dummy_value entry.vector_index = dummy_value
entry.feats_row = dummy_value entry.feats_row = dummy_value
entry.prob = dummy_value entry.freq = dummy_value
# Avoid struct initializer to enable nogil # Avoid struct initializer to enable nogil
cdef vector[int64_t] dummy_entry_indices cdef vector[int64_t] dummy_entry_indices
@ -141,7 +141,7 @@ cdef class KnowledgeBase:
self._aliases_table.push_back(alias) self._aliases_table.push_back(alias)
cpdef load_bulk(self, loc) cpdef load_bulk(self, loc)
cpdef set_entities(self, entity_list, prob_list, vector_list) cpdef set_entities(self, entity_list, freq_list, vector_list)
cdef class Writer: cdef class Writer:
@ -149,7 +149,7 @@ cdef class Writer:
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
cdef int write_vector_element(self, float element) except -1 cdef int write_vector_element(self, float element) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1 cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
cdef int write_alias_length(self, int64_t alias_length) except -1 cdef int write_alias_length(self, int64_t alias_length) except -1
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
@ -162,7 +162,7 @@ cdef class Reader:
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
cdef int read_vector_element(self, float* element) except -1 cdef int read_vector_element(self, float* element) except -1
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1 cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
cdef int read_alias_length(self, int64_t* alias_length) except -1 cdef int read_alias_length(self, int64_t* alias_length) except -1
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1

View File

@ -94,7 +94,7 @@ cdef class KnowledgeBase:
def get_alias_strings(self): def get_alias_strings(self):
return [self.vocab.strings[x] for x in self._alias_index] return [self.vocab.strings[x] for x in self._alias_index]
def add_entity(self, unicode entity, float prob, vector[float] entity_vector): def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
""" """
Add an entity to the KB, specifying its log probability based on corpus frequency Add an entity to the KB, specifying its log probability based on corpus frequency
Return the hash of the entity ID/name at the end. Return the hash of the entity ID/name at the end.
@ -113,15 +113,15 @@ cdef class KnowledgeBase:
vector_index = self.c_add_vector(entity_vector=entity_vector) vector_index = self.c_add_vector(entity_vector=entity_vector)
new_index = self.c_add_entity(entity_hash=entity_hash, new_index = self.c_add_entity(entity_hash=entity_hash,
prob=prob, freq=freq,
vector_index=vector_index, vector_index=vector_index,
feats_row=-1) # Features table currently not implemented feats_row=-1) # Features table currently not implemented
self._entry_index[entity_hash] = new_index self._entry_index[entity_hash] = new_index
return entity_hash return entity_hash
cpdef set_entities(self, entity_list, prob_list, vector_list): cpdef set_entities(self, entity_list, freq_list, vector_list):
if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list): if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140) raise ValueError(Errors.E140)
nr_entities = len(entity_list) nr_entities = len(entity_list)
@ -137,7 +137,7 @@ cdef class KnowledgeBase:
entity_hash = self.vocab.strings.add(entity_list[i]) entity_hash = self.vocab.strings.add(entity_list[i])
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.prob = prob_list[i] entry.freq = freq_list[i]
vector_index = self.c_add_vector(entity_vector=vector_list[i]) vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index entry.vector_index = vector_index
@ -196,7 +196,7 @@ cdef class KnowledgeBase:
return [Candidate(kb=self, return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash, entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].prob, entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index], entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prior_prob) prior_prob=prior_prob)
@ -252,7 +252,7 @@ cdef class KnowledgeBase:
entry = self._entries[entry_index] entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash assert entry.entity_hash == entry_hash
assert entry_index == i assert entry_index == i
writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index) writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
i = i+1 i = i+1
writer.write_alias_length(self.get_size_aliases()) writer.write_alias_length(self.get_size_aliases())
@ -278,7 +278,7 @@ cdef class KnowledgeBase:
cdef hash_t entity_hash cdef hash_t entity_hash
cdef hash_t alias_hash cdef hash_t alias_hash
cdef int64_t entry_index cdef int64_t entry_index
cdef float prob cdef float freq
cdef int32_t vector_index cdef int32_t vector_index
cdef KBEntryC entry cdef KBEntryC entry
cdef AliasC alias cdef AliasC alias
@ -314,10 +314,10 @@ cdef class KnowledgeBase:
# index 0 is a dummy object not stored in the _entry_index and can be ignored. # index 0 is a dummy object not stored in the _entry_index and can be ignored.
i = 1 i = 1
while i <= nr_entities: while i <= nr_entities:
reader.read_entry(&entity_hash, &prob, &vector_index) reader.read_entry(&entity_hash, &freq, &vector_index)
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.prob = prob entry.freq = freq
entry.vector_index = vector_index entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented entry.feats_row = -1 # Features table currently not implemented
@ -387,9 +387,9 @@ cdef class Writer:
cdef int write_vector_element(self, float element) except -1: cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element)) self._write(&element, sizeof(element))
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1: cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
self._write(&entry_hash, sizeof(entry_hash)) self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_prob, sizeof(entry_prob)) self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index)) self._write(&vector_index, sizeof(vector_index))
# Features table currently not implemented and not written to file # Features table currently not implemented and not written to file
@ -444,18 +444,18 @@ cdef class Reader:
return 0 # end of file return 0 # end of file
raise IOError("error reading entity vector from input file") raise IOError("error reading entity vector from input file")
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1: cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
status = self._read(entity_hash, sizeof(hash_t)) status = self._read(entity_hash, sizeof(hash_t))
if status < 1: if status < 1:
if feof(self._fp): if feof(self._fp):
return 0 # end of file return 0 # end of file
raise IOError("error reading entity hash from input file") raise IOError("error reading entity hash from input file")
status = self._read(prob, sizeof(float)) status = self._read(freq, sizeof(float))
if status < 1: if status < 1:
if feof(self._fp): if feof(self._fp):
return 0 # end of file return 0 # end of file
raise IOError("error reading entity prob from input file") raise IOError("error reading entity freq from input file")
status = self._read(vector_index, sizeof(int32_t)) status = self._read(vector_index, sizeof(int32_t))
if status < 1: if status < 1:

View File

@ -93,7 +93,7 @@ cdef struct KBEntryC:
int32_t feats_row int32_t feats_row
# log probability of entity, based on corpus frequency # log probability of entity, based on corpus frequency
float prob float freq
# Each alias struct stores a list of Entry pointers with their prior probabilities # Each alias struct stores a list of Entry pointers with their prior probabilities

View File

@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3]) mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0]) mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5]) mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5])
# adding aliases # adding aliases
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
@ -50,9 +50,9 @@ def test_kb_invalid_entities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases - should fail because one of the given IDs is not valid # adding aliases - should fail because one of the given IDs is not valid
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -66,9 +66,9 @@ def test_kb_invalid_probabilities(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases - should fail because the sum of the probabilities exceeds 1 # adding aliases - should fail because the sum of the probabilities exceeds 1
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -80,9 +80,9 @@ def test_kb_invalid_combination(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases - should fail because the entities and probabilities vectors are not of equal length # adding aliases - should fail because the entities and probabilities vectors are not of equal length
with pytest.raises(ValueError): with pytest.raises(ValueError):
@ -96,11 +96,11 @@ def test_kb_invalid_entity_vector(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3]) mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3])
# this should fail because the kb's expected entity vector length is 3 # this should fail because the kb's expected entity vector length is 3
with pytest.raises(ValueError): with pytest.raises(ValueError):
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
def test_candidate_generation(nlp): def test_candidate_generation(nlp):
@ -108,9 +108,9 @@ def test_candidate_generation(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
# adding aliases # adding aliases
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
@ -133,8 +133,8 @@ def test_preserving_links_asdoc(nlp):
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
# adding entities # adding entities
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1])
# adding aliases # adding aliases
mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])

View File

@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab): def _get_dummy_kb(vocab):
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3]) kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3])
kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0]) kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0])
kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7]) kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7])
kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4]) kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4])
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])