mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
rename entity frequency
This commit is contained in:
parent
f75d1299a7
commit
dae8a21282
|
@ -70,7 +70,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
|
|||
|
||||
print()
|
||||
print(" * adding", len(entity_list), "entities", datetime.datetime.now())
|
||||
kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings)
|
||||
kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)
|
||||
|
||||
print()
|
||||
print(" * adding aliases", datetime.datetime.now())
|
||||
|
|
|
@ -14,15 +14,15 @@ def create_kb(vocab):
|
|||
# adding entities
|
||||
entity_0 = "Q1004791_Douglas"
|
||||
print("adding entity", entity_0)
|
||||
kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0])
|
||||
kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0])
|
||||
|
||||
entity_1 = "Q42_Douglas_Adams"
|
||||
print("adding entity", entity_1)
|
||||
kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1])
|
||||
kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1])
|
||||
|
||||
entity_2 = "Q5301561_Douglas_Haig"
|
||||
print("adding entity", entity_2)
|
||||
kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2])
|
||||
kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2])
|
||||
|
||||
# adding aliases
|
||||
print()
|
||||
|
|
12
spacy/kb.pxd
12
spacy/kb.pxd
|
@ -79,7 +79,7 @@ cdef class KnowledgeBase:
|
|||
return new_index
|
||||
|
||||
|
||||
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
|
||||
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
|
||||
int32_t vector_index, int feats_row) nogil:
|
||||
"""Add an entry to the vector of entries.
|
||||
After calling this method, make sure to also update the _entry_index using the return value."""
|
||||
|
@ -92,7 +92,7 @@ cdef class KnowledgeBase:
|
|||
entry.entity_hash = entity_hash
|
||||
entry.vector_index = vector_index
|
||||
entry.feats_row = feats_row
|
||||
entry.prob = prob
|
||||
entry.freq = freq
|
||||
|
||||
self._entries.push_back(entry)
|
||||
return new_index
|
||||
|
@ -125,7 +125,7 @@ cdef class KnowledgeBase:
|
|||
entry.entity_hash = dummy_hash
|
||||
entry.vector_index = dummy_value
|
||||
entry.feats_row = dummy_value
|
||||
entry.prob = dummy_value
|
||||
entry.freq = dummy_value
|
||||
|
||||
# Avoid struct initializer to enable nogil
|
||||
cdef vector[int64_t] dummy_entry_indices
|
||||
|
@ -141,7 +141,7 @@ cdef class KnowledgeBase:
|
|||
self._aliases_table.push_back(alias)
|
||||
|
||||
cpdef load_bulk(self, loc)
|
||||
cpdef set_entities(self, entity_list, prob_list, vector_list)
|
||||
cpdef set_entities(self, entity_list, freq_list, vector_list)
|
||||
|
||||
|
||||
cdef class Writer:
|
||||
|
@ -149,7 +149,7 @@ cdef class Writer:
|
|||
|
||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
|
||||
cdef int write_vector_element(self, float element) except -1
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
|
||||
|
||||
cdef int write_alias_length(self, int64_t alias_length) except -1
|
||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
|
||||
|
@ -162,7 +162,7 @@ cdef class Reader:
|
|||
|
||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
|
||||
cdef int read_vector_element(self, float* element) except -1
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
|
||||
|
||||
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
|
||||
|
|
30
spacy/kb.pyx
30
spacy/kb.pyx
|
@ -94,7 +94,7 @@ cdef class KnowledgeBase:
|
|||
def get_alias_strings(self):
|
||||
return [self.vocab.strings[x] for x in self._alias_index]
|
||||
|
||||
def add_entity(self, unicode entity, float prob, vector[float] entity_vector):
|
||||
def add_entity(self, unicode entity, float freq, vector[float] entity_vector):
|
||||
"""
|
||||
Add an entity to the KB, specifying its log probability based on corpus frequency.
|
||||
Return the hash of the entity ID/name at the end.
|
||||
|
@ -113,15 +113,15 @@ cdef class KnowledgeBase:
|
|||
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
||||
|
||||
new_index = self.c_add_entity(entity_hash=entity_hash,
|
||||
prob=prob,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1) # Features table currently not implemented
|
||||
self._entry_index[entity_hash] = new_index
|
||||
|
||||
return entity_hash
|
||||
|
||||
cpdef set_entities(self, entity_list, prob_list, vector_list):
|
||||
if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list):
|
||||
cpdef set_entities(self, entity_list, freq_list, vector_list):
|
||||
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
|
||||
raise ValueError(Errors.E140)
|
||||
|
||||
nr_entities = len(entity_list)
|
||||
|
@ -137,7 +137,7 @@ cdef class KnowledgeBase:
|
|||
|
||||
entity_hash = self.vocab.strings.add(entity_list[i])
|
||||
entry.entity_hash = entity_hash
|
||||
entry.prob = prob_list[i]
|
||||
entry.freq = freq_list[i]
|
||||
|
||||
vector_index = self.c_add_vector(entity_vector=vector_list[i])
|
||||
entry.vector_index = vector_index
|
||||
|
@ -196,7 +196,7 @@ cdef class KnowledgeBase:
|
|||
|
||||
return [Candidate(kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].prob,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
alias_hash=alias_hash,
|
||||
prior_prob=prior_prob)
|
||||
|
@ -252,7 +252,7 @@ cdef class KnowledgeBase:
|
|||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index)
|
||||
writer.write_entry(entry.entity_hash, entry.freq, entry.vector_index)
|
||||
i = i+1
|
||||
|
||||
writer.write_alias_length(self.get_size_aliases())
|
||||
|
@ -278,7 +278,7 @@ cdef class KnowledgeBase:
|
|||
cdef hash_t entity_hash
|
||||
cdef hash_t alias_hash
|
||||
cdef int64_t entry_index
|
||||
cdef float prob
|
||||
cdef float freq
|
||||
cdef int32_t vector_index
|
||||
cdef KBEntryC entry
|
||||
cdef AliasC alias
|
||||
|
@ -314,10 +314,10 @@ cdef class KnowledgeBase:
|
|||
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
|
||||
i = 1
|
||||
while i <= nr_entities:
|
||||
reader.read_entry(&entity_hash, &prob, &vector_index)
|
||||
reader.read_entry(&entity_hash, &freq, &vector_index)
|
||||
|
||||
entry.entity_hash = entity_hash
|
||||
entry.prob = prob
|
||||
entry.freq = freq
|
||||
entry.vector_index = vector_index
|
||||
entry.feats_row = -1 # Features table currently not implemented
|
||||
|
||||
|
@ -387,9 +387,9 @@ cdef class Writer:
|
|||
cdef int write_vector_element(self, float element) except -1:
|
||||
self._write(&element, sizeof(element))
|
||||
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1:
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
||||
self._write(&entry_hash, sizeof(entry_hash))
|
||||
self._write(&entry_prob, sizeof(entry_prob))
|
||||
self._write(&entry_freq, sizeof(entry_freq))
|
||||
self._write(&vector_index, sizeof(vector_index))
|
||||
# Features table currently not implemented and not written to file
|
||||
|
||||
|
@ -444,18 +444,18 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError("error reading entity vector from input file")
|
||||
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1:
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
||||
status = self._read(entity_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
return 0 # end of file
|
||||
raise IOError("error reading entity hash from input file")
|
||||
|
||||
status = self._read(prob, sizeof(float))
|
||||
status = self._read(freq, sizeof(float))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
return 0 # end of file
|
||||
raise IOError("error reading entity prob from input file")
|
||||
raise IOError("error reading entity freq from input file")
|
||||
|
||||
status = self._read(vector_index, sizeof(int32_t))
|
||||
if status < 1:
|
||||
|
|
|
@ -93,7 +93,7 @@ cdef struct KBEntryC:
|
|||
int32_t feats_row
|
||||
|
||||
# log probability of entity, based on corpus frequency
|
||||
float prob
|
||||
float freq
|
||||
|
||||
|
||||
# Each alias struct stores a list of Entry pointers with their prior probabilities
|
||||
|
|
|
@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[8, 4, 3])
|
||||
mykb.add_entity(entity="Q2", prob=0.5, entity_vector=[2, 1, 0])
|
||||
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[-1, -6, 5])
|
||||
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
|
||||
mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
|
||||
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5])
|
||||
|
||||
# adding aliases
|
||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
|
||||
|
@ -50,9 +50,9 @@ def test_kb_invalid_entities(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
|
||||
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
|
||||
|
||||
# adding aliases - should fail because one of the given IDs is not valid
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -66,9 +66,9 @@ def test_kb_invalid_probabilities(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
|
||||
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
|
||||
|
||||
# adding aliases - should fail because the sum of the probabilities exceeds 1
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -80,9 +80,9 @@ def test_kb_invalid_combination(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
|
||||
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
|
||||
|
||||
# adding aliases - should fail because the entities and probabilities vectors are not of equal length
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -96,11 +96,11 @@ def test_kb_invalid_entity_vector(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1, 2, 3])
|
||||
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3])
|
||||
|
||||
# this should fail because the kb's expected entity vector length is 3
|
||||
with pytest.raises(ValueError):
|
||||
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
|
||||
|
||||
|
||||
def test_candidate_generation(nlp):
|
||||
|
@ -108,9 +108,9 @@ def test_candidate_generation(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.7, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", prob=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", prob=0.5, entity_vector=[3])
|
||||
mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])
|
||||
mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3])
|
||||
|
||||
# adding aliases
|
||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
|
||||
|
@ -133,8 +133,8 @@ def test_preserving_links_asdoc(nlp):
|
|||
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
|
||||
|
||||
# adding entities
|
||||
mykb.add_entity(entity="Q1", prob=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", prob=0.8, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1])
|
||||
mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1])
|
||||
|
||||
# adding aliases
|
||||
mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
|
||||
|
|
|
@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab):
|
|||
def _get_dummy_kb(vocab):
|
||||
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
|
||||
|
||||
kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3])
|
||||
kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0])
|
||||
kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7])
|
||||
kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4])
|
||||
kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3])
|
||||
kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0])
|
||||
kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7])
|
||||
kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4])
|
||||
|
||||
kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9])
|
||||
kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1])
|
||||
|
|
Loading…
Reference in New Issue
Block a user