custom reader and writer for _EntryC fields (first stab at it - not complete)

This commit is contained in:
svlandeg 2019-04-23 16:33:40 +02:00
parent 004e5e7d1c
commit 8e70a564f1
3 changed files with 133 additions and 3 deletions

View File

@ -16,6 +16,8 @@ ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-art
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
# these will/should be matched ignoring case # these will/should be matched ignoring case
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
@ -418,14 +420,22 @@ if __name__ == "__main__":
# _read_wikipedia_prior_probs() # _read_wikipedia_prior_probs()
# STEP 2 : create KB # STEP 2 : create KB
nlp = spacy.load('en_core_web_sm') # nlp = spacy.load('en_core_web_sm')
my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
# STEP 3 : write KB to file # STEP 3 : write KB to file
# TODO # TODO
nlp = spacy.load('en_core_web_sm')
kb = KnowledgeBase(vocab=nlp.vocab)
kb.dump(KB_FILE)
print("DUMPED")
kb.load(KB_FILE)
print("LOADED")
# PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
# STEP 4 : read KB back in from file # STEP 4 : read KB back in from file
# TODO # TODO
# STEP 5 : actually use the EL functionality # STEP 5 : actually use the EL functionality
add_el(my_kb, nlp) # add_el(my_kb, nlp)

View File

@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, int64_t
from spacy.vocab cimport Vocab from spacy.vocab cimport Vocab
from .typedefs cimport hash_t from .typedefs cimport hash_t
from libc.stdio cimport FILE
# Internal struct, for storage and disambiguation. This isn't what we return # Internal struct, for storage and disambiguation. This isn't what we return
# to the user as the answer to "here's your entity". It's the minimum number # to the user as the answer to "here's your entity". It's the minimum number
@ -158,3 +160,15 @@ cdef class KnowledgeBase:
self._aliases_table.push_back(alias) self._aliases_table.push_back(alias)
# Declaration of the binary writer used to serialise KB entry fields.
cdef class Writer:
# C stdio handle of the output file.
cdef FILE* _fp
# Write one record (entry ID, entity hash, probability). The `except -1`
# clause lets a Python exception propagate out of this C-level method.
cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
# Declaration of the binary reader that is the counterpart of Writer.
cdef class Reader:
# C stdio handle of the input file.
cdef FILE* _fp
# Exposed to Python via `public`.
# NOTE(review): not referenced by the Reader implementation in this commit.
cdef public int32_t nr_feat
# Read one record into the three output pointers. The `except -1` clause lets
# a Python exception propagate out of this C-level method.
# NOTE(review): `mem` is accepted but unused by the implementation in this commit.
cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1

View File

@ -1,8 +1,23 @@
# cython: infer_types=True # cython: infer_types=True
# cython: profile=True # cython: profile=True
# coding: utf8 # coding: utf8
from collections import OrderedDict
from cpython.exc cimport PyErr_CheckSignals
from spacy import util
from spacy.errors import Errors, Warnings, user_warning from spacy.errors import Errors, Warnings, user_warning
from cpython.mem cimport PyMem_Malloc
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libc.stdlib cimport qsort
from .typedefs cimport hash_t
from os import path
cdef class Candidate: cdef class Candidate:
@ -139,3 +154,94 @@ cdef class KnowledgeBase:
prior_prob=prob) prior_prob=prob)
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0] if entry_index != 0]
def dump(self, loc):
    """Write a placeholder entry to a binary file at `loc`.

    The real contents of the KB are not serialised yet: a single hard-coded
    record (entry ID 32, the hash of "Q342", probability 0.333) is written
    so the Writer/Reader round trip can be exercised.

    loc: Path of the output file, handed to `Writer`.
    """
    # TODO: actually dump the data in this KB :-)
    cdef int64_t entry_id = 32
    self.vocab.strings.add("Q342")
    cdef hash_t entity_hash = self.vocab.strings["Q342"]
    cdef float prob = 0.333

    cdef Writer writer = Writer(loc)
    try:
        writer.write(entry_id, entity_hash, prob)
    finally:
        # Close the file even when the write fails, so the handle is not leaked.
        writer.close()
def load(self, loc):
    """Read the first record of the binary file at `loc`.

    Work in progress: the values are copied into a local `_EntryC` that is
    discarded when the method returns; nothing is stored in the KB yet.

    loc: Path of the input file, handed to `Reader`.
    """
    cdef int64_t entry_id
    cdef hash_t entity_hash
    cdef float prob
    cdef _EntryC entry
    # TODO
    cdef int32_t placeholder = 342
    cdef Reader reader = Reader(loc)

    reader.read(self.mem, &entry_id, &entity_hash, &prob)

    entry.entity_hash = entity_hash
    entry.prob = prob
    # Stand-in values until feature and vector rows are serialised as well.
    entry.vector_rows = &placeholder
    entry.feats_row = placeholder
cdef class Writer:
    """Write KB entry fields to a binary file through a C FILE handle.

    Each record consists of the raw bytes of an int64 entry ID, a hash_t
    entity hash and a float probability, in that order.
    """
    def __init__(self, object loc):
        """Open `loc` for binary writing, truncating any existing file.

        loc (unicode or bytes): Path of the output file.
        RAISES (IOError): If `loc` is a directory or cannot be opened.
        """
        # Raise rather than assert: assertions can be compiled out or
        # disabled with -O, which would let a bad path through.
        if path.isdir(loc):
            raise IOError("%s is directory." % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if self._fp == NULL:
            # Turns errno into an IOError, as Reader does.
            PyErr_SetFromErrno(IOError)

    def __dealloc__(self):
        # Release the handle if the caller never called close().
        if self._fp != NULL:
            fclose(self._fp)
            self._fp = NULL

    def close(self):
        """Flush and close the file; calling it again is a no-op.

        RAISES (IOError): If the underlying fclose() reports an error.
        """
        cdef int status
        if self._fp == NULL:
            return
        status = fclose(self._fp)
        # The stream is invalid after fclose() whether or not it succeeded.
        self._fp = NULL
        if status != 0:
            PyErr_SetFromErrno(IOError)

    cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1:
        """Append one record to the file.

        RETURNS (int): 0 on success.
        RAISES (IOError): If the writer was already closed; errors from the
            underlying `_write` calls propagate unchanged.
        """
        if self._fp == NULL:
            raise IOError("cannot write to a closed Writer")
        # TODO: feats_rows and vector rows
        _write(&entry_id, sizeof(entry_id), self._fp)
        _write(&entry_hash, sizeof(entry_hash), self._fp)
        _write(&entry_prob, sizeof(entry_prob), self._fp)
        return 0
cdef int _write(void* value, size_t size, FILE* fp) except -1:
    """Write `size` bytes starting at `value` to `fp` as a single item.

    RETURNS (int): 0 on success.
    RAISES (IOError): If fwrite() did not write the complete item.
    """
    cdef size_t written = fwrite(value, size, 1, fp)
    # Raise instead of assert so a short write is never silently ignored
    # when assertions are disabled.
    if written != 1:
        raise IOError("error writing %d bytes to output file" % size)
    return 0
cdef class Reader:
    """Read KB entry fields from a binary file through a C FILE handle.

    Expects records of an int64 entry ID, a hash_t entity hash and a float
    probability, in that order.
    """
    def __init__(self, object loc):
        """Open `loc` for binary reading.

        loc (unicode or bytes): Path of the input file.
        RAISES (IOError): If `loc` does not exist, is a directory, or cannot
            be opened.
        """
        # Raise rather than assert: assertions can be compiled out or
        # disabled with -O, which would let a bad path through.
        if not path.exists(loc):
            raise IOError("%s does not exist." % loc)
        if path.isdir(loc):
            raise IOError("%s is directory." % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
        fseek(self._fp, 0, 0)  # this can be 0 if there is no header

    def __dealloc__(self):
        # _fp is still NULL when __init__ failed before or during fopen().
        if self._fp != NULL:
            fclose(self._fp)
            self._fp = NULL

    cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
        """Read the next record into the three output pointers.

        mem (Pool): Currently unused.
        RETURNS (int): 1 if a record was read, 0 if the end of the file was
            reached first.
        RAISES (IOError): If a read fails for a reason other than end of file.
        """
        # Sizes are those of the pointed-to types. sizeof() on the pointer
        # arguments would give the pointer width, which over-reads into the
        # 4-byte float on 64-bit builds.
        status = fread(entry_id, sizeof(int64_t), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entry ID from input file")

        status = fread(entity_hash, sizeof(hash_t), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity hash from input file")

        status = fread(prob, sizeof(float), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity prob from input file")

        return 1