diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index e8e081cef..d8cdf6dd7 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -70,7 +70,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() print(" * adding", len(entity_list), "entities", datetime.datetime.now()) - kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings) + kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings) print() print(" * adding aliases", datetime.datetime.now()) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 0e59db304..6dde616b8 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -14,15 +14,15 @@ def create_kb(vocab): # adding entities entity_0 = "Q1004791_Douglas" print("adding entity", entity_0) - kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0]) + kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0]) entity_1 = "Q42_Douglas_Adams" print("adding entity", entity_1) - kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1]) + kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1]) entity_2 = "Q5301561_Douglas_Haig" print("adding entity", entity_2) - kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2]) + kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2]) # adding aliases print() diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 40b22b275..d5aa382b1 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -79,7 +79,7 @@ cdef class KnowledgeBase: return new_index - cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq, int32_t vector_index, int feats_row) nogil: """Add an entry to the vector of entries. After calling this method, make sure to update also the _entry_index using the return value""" @@ -92,7 +92,7 @@ cdef class KnowledgeBase: entry.entity_hash = entity_hash entry.vector_index = vector_index entry.feats_row = feats_row - entry.prob = prob + entry.freq = freq self._entries.push_back(entry) return new_index @@ -125,7 +125,7 @@ cdef class KnowledgeBase: entry.entity_hash = dummy_hash entry.vector_index = dummy_value entry.feats_row = dummy_value - entry.prob = dummy_value + entry.freq = dummy_value # Avoid struct initializer to enable nogil cdef vector[int64_t] dummy_entry_indices @@ -141,7 +141,7 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) cpdef load_bulk(self, loc) - cpdef set_entities(self, entity_list, prob_list, vector_list) + cpdef set_entities(self, entity_list, freq_list, vector_list) cdef class Writer: @@ -149,7 +149,7 @@ cdef class Writer: cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 cdef int write_vector_element(self, float element) except -1 - cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1 + cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1 cdef int write_alias_length(self, int64_t alias_length) except -1 cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 @@ -162,7 +162,7 @@ cdef class Reader: cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 cdef int read_vector_element(self, float* element) except -1 - cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1 + cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1 cdef int read_alias_length(self, int64_t* alias_length) except -1 cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index bdd7da0f9..9df0e4fc2 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -94,7 +94,7 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_entity(self, unicode entity, float prob, vector[float] entity_vector): + def add_entity(self, unicode entity, float freq, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. @@ -113,15 +113,15 @@ cdef class KnowledgeBase: vector_index = self.c_add_vector(entity_vector=entity_vector) new_index = self.c_add_entity(entity_hash=entity_hash, - prob=prob, + freq=freq, vector_index=vector_index, feats_row=-1) # Features table currently not implemented self._entry_index[entity_hash] = new_index return entity_hash - cpdef set_entities(self, entity_list, prob_list, vector_list): - if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list): + cpdef set_entities(self, entity_list, freq_list, vector_list): + if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list): raise ValueError(Errors.E140) nr_entities = len(entity_list) @@ -137,7 +137,7 @@ cdef class KnowledgeBase: entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash - entry.prob = prob_list[i] + entry.freq = freq_list[i] vector_index = self.c_add_vector(entity_vector=vector_list[i]) entry.vector_index = vector_index @@ -196,7 +196,7 @@ cdef class KnowledgeBase: return [Candidate(kb=self, entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].prob, + entity_freq=self._entries[entry_index].freq, entity_vector=self._vectors_table[self._entries[entry_index].vector_index], alias_hash=alias_hash, prior_prob=prior_prob) @@ -252,7 +252,7 @@ cdef class KnowledgeBase: entry = self._entries[entry_index] assert entry.entity_hash == entry_hash assert entry_index == i - writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index) + writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index) i = i+1 writer.write_alias_length(self.get_size_aliases()) @@ -278,7 +278,7 @@ cdef class KnowledgeBase: cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index - cdef float prob + cdef float freq cdef int32_t vector_index cdef KBEntryC entry cdef AliasC alias @@ -314,10 +314,10 @@ cdef class KnowledgeBase: # index 0 is a dummy object not stored in the _entry_index and can be ignored. i = 1 while i <= nr_entities: - reader.read_entry(&entity_hash, &prob, &vector_index) + reader.read_entry(&entity_hash, &freq, &vector_index) entry.entity_hash = entity_hash - entry.prob = prob + entry.freq = freq entry.vector_index = vector_index entry.feats_row = -1 # Features table currently not implemented @@ -387,9 +387,9 @@ cdef class Writer: cdef int write_vector_element(self, float element) except -1: self._write(&element, sizeof(element)) - cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1: + cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1: self._write(&entry_hash, sizeof(entry_hash)) - self._write(&entry_prob, sizeof(entry_prob)) + self._write(&entry_freq, sizeof(entry_freq)) self._write(&vector_index, sizeof(vector_index)) # Features table currently not implemented and not written to file @@ -444,18 +444,18 @@ cdef class Reader: return 0 # end of file raise IOError("error reading entity vector from input file") - cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1: + cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity hash from input file") - status = self._read(prob, sizeof(float)) + status = self._read(freq, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file - raise IOError("error reading entity prob from input file") + raise IOError("error reading entity freq from input file") status = self._read(vector_index, sizeof(int32_t)) if status < 1: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index e80b1b4d6..6c643b4cd 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -93,7 +93,7 @@ cdef struct KBEntryC: int32_t feats_row # log probability of entity, based on corpus frequency - float prob + float freq # Each alias struct stores a list of Entry pointers with their prior probabilities diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index c3163200a..ab4055bba 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3]) - mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3]) + mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5]) # adding aliases mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) @@ -50,9 +50,9 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): @@ -66,9 +66,9 @@ def test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): @@ -80,9 +80,9 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): @@ -96,11 +96,11 @@ def test_kb_invalid_entity_vector(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3]) # this should fail because the kb's expected entity vector length is 3 with pytest.raises(ValueError): - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) def test_candidate_generation(nlp): @@ -108,9 +108,9 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) # adding aliases mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) @@ -133,8 +133,8 @@ def test_preserving_links_asdoc(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1]) + mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1]) # adding aliases mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index fa7253fa1..1752abda2 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4]) kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])