mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
dumping all entryC entries + (inefficient) reading back in
This commit is contained in:
parent
8e70a564f1
commit
694fea597a
|
@ -426,16 +426,22 @@ if __name__ == "__main__":
|
|||
# STEP 3 : write KB to file
|
||||
# TODO
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.dump(KB_FILE)
|
||||
print("DUMPED")
|
||||
kb.load(KB_FILE)
|
||||
print("LOADED")
|
||||
kb1 = KnowledgeBase(vocab=nlp.vocab)
|
||||
|
||||
# PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||
kb1.add_entity(entity="Q53", prob=0.33)
|
||||
kb1.add_entity(entity="Q17", prob=0.1)
|
||||
kb1.add_entity(entity="Q007", prob=0.7)
|
||||
kb1.add_entity(entity="Q44", prob=0.4)
|
||||
print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases())
|
||||
|
||||
kb1.dump(KB_FILE)
|
||||
|
||||
# STEP 4 : read KB back in from file
|
||||
# TODO
|
||||
|
||||
kb2 = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb2.load(KB_FILE)
|
||||
print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases())
|
||||
|
||||
# STEP 5 : actually use the EL functionality
|
||||
# add_el(my_kb, nlp)
|
||||
|
|
|
@ -168,7 +168,6 @@ cdef class Writer:
|
|||
|
||||
cdef class Reader:
|
||||
cdef FILE* _fp
|
||||
cdef public int32_t nr_feat
|
||||
|
||||
cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1
|
||||
|
||||
|
|
64
spacy/kb.pyx
64
spacy/kb.pyx
|
@ -157,33 +157,45 @@ cdef class KnowledgeBase:
|
|||
|
||||
|
||||
def dump(self, loc):
|
||||
# TODO: actually dump the data in this KB :-)
|
||||
|
||||
cdef int64_t entry_id = 32
|
||||
self.vocab.strings.add("Q342")
|
||||
cdef hash_t entity_hash = self.vocab.strings["Q342"]
|
||||
cdef float prob = 0.333
|
||||
|
||||
cdef Writer writer = Writer(loc)
|
||||
writer.write(entry_id, entity_hash, prob)
|
||||
|
||||
for key, entry_index in self._entry_index.items():
|
||||
entry = self._entries[entry_index]
|
||||
print("dumping")
|
||||
print("index", entry_index)
|
||||
print("hash", entry.entity_hash)
|
||||
print("prob", entry.prob)
|
||||
print("")
|
||||
writer.write(entry_index, entry.entity_hash, entry.prob)
|
||||
|
||||
writer.close()
|
||||
|
||||
def load(self, loc):
|
||||
cdef int64_t entry_id
|
||||
cdef hash_t entity_hash
|
||||
cdef float prob
|
||||
cdef _EntryC entry
|
||||
cdef int32_t dummy_value = 342
|
||||
|
||||
cdef Reader reader = Reader(loc)
|
||||
reader.read(self.mem, &entry_id, &entity_hash, &prob)
|
||||
result = reader.read(self.mem, &entry_id, &entity_hash, &prob) # -1: error, 0: eof after this one
|
||||
while result:
|
||||
print("loading")
|
||||
print("entryID", entry_id)
|
||||
print("hash", entity_hash)
|
||||
print("prob", prob)
|
||||
print("result:", result)
|
||||
print("")
|
||||
entry.entity_hash = entity_hash
|
||||
entry.prob = prob
|
||||
|
||||
cdef _EntryC entry
|
||||
entry.entity_hash = entity_hash
|
||||
entry.prob = prob
|
||||
# TODO features and vectors
|
||||
entry.vector_rows = &dummy_value
|
||||
entry.feats_row = dummy_value
|
||||
|
||||
# TODO
|
||||
cdef int32_t dummy_value = 342
|
||||
entry.vector_rows = &dummy_value
|
||||
entry.feats_row = dummy_value
|
||||
# TODO: use set instead of push_back to ensure the index remains the same?
|
||||
self._entries.push_back(entry)
|
||||
result = reader.read(self.mem, &entry_id, &entity_hash, &prob)
|
||||
|
||||
cdef class Writer:
|
||||
def __init__(self, object loc):
|
||||
|
@ -199,10 +211,7 @@ cdef class Writer:
|
|||
assert status == 0
|
||||
|
||||
cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1:
|
||||
cdef int i = 0
|
||||
|
||||
# TODO: feats_rows and vector rows
|
||||
|
||||
_write(&entry_id, sizeof(entry_id), self._fp)
|
||||
_write(&entry_hash, sizeof(entry_hash), self._fp)
|
||||
_write(&entry_prob, sizeof(entry_prob), self._fp)
|
||||
|
@ -227,21 +236,30 @@ cdef class Reader:
|
|||
fclose(self._fp)
|
||||
|
||||
cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
|
||||
status = fread(entry_id, sizeof(entry_id), 1, self._fp)
|
||||
"""
|
||||
Return values:
|
||||
-1: error during current read (EOF during call)
|
||||
0: means we read the last line successfully (EOF after call)
|
||||
1: we can continue reading this file """
|
||||
status = fread(entry_id, sizeof(int64_t), 1, self._fp)
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
return 0 # end of file
|
||||
raise IOError("error reading entry ID from input file")
|
||||
|
||||
#status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp)
|
||||
status = fread(entity_hash, sizeof(entity_hash), 1, self._fp)
|
||||
status = fread(entity_hash, sizeof(hash_t), 1, self._fp)
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
return 0 # end of file
|
||||
raise IOError("error reading entity hash from input file")
|
||||
|
||||
status = fread(prob, sizeof(prob), 1, self._fp)
|
||||
status = fread(prob, sizeof(float), 1, self._fp)
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
return 0 # end of file
|
||||
raise IOError("error reading entity prob from input file")
|
||||
|
||||
if feof(self._fp):
|
||||
return 0
|
||||
else:
|
||||
return 1
|
||||
|
|
Loading…
Reference in New Issue
Block a user