diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index a02226f9f..84e8066e2 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -16,6 +16,8 @@ ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-art
 ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 
+KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+
 
 # these will/should be matched ignoring case
 wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
@@ -418,14 +420,22 @@ if __name__ == "__main__":
     # _read_wikipedia_prior_probs()
 
     # STEP 2 : create KB
-    nlp = spacy.load('en_core_web_sm')
-    my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
+    # nlp = spacy.load('en_core_web_sm')
+    # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
 
     # STEP 3 : write KB to file
     # TODO
+    nlp = spacy.load('en_core_web_sm')
+    kb = KnowledgeBase(vocab=nlp.vocab)
+    kb.dump(KB_FILE)
+    print("DUMPED")
+    kb.load(KB_FILE)
+    print("LOADED")
+
+    # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 
     # STEP 4 : read KB back in from file
     # TODO
 
     # STEP 5 : actually use the EL functionality
-    add_el(my_kb, nlp)
+    # add_el(my_kb, nlp)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 3cdf1e07e..eab947b66 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, int64_t
 from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
 
+from libc.stdio cimport FILE
+
 
 # Internal struct, for storage and disambiguation. This isn't what we return
 # to the user as the answer to "here's your entity". It's the minimum number
@@ -158,3 +160,15 @@ cdef class KnowledgeBase:
         self._aliases_table.push_back(alias)
 
 
+cdef class Writer:
+    cdef FILE* _fp
+
+    cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
+
+
+cdef class Reader:
+    cdef FILE* _fp
+    cdef public int32_t nr_feat
+
+    cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1
+
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 8a1710a9c..207231c99 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,8 +1,23 @@
 # cython: infer_types=True
 # cython: profile=True
 # coding: utf8
+from collections import OrderedDict
+from cpython.exc cimport PyErr_CheckSignals
+
+from spacy import util
 from spacy.errors import Errors, Warnings, user_warning
 
+from cpython.mem cimport PyMem_Malloc
+from cpython.exc cimport PyErr_SetFromErrno
+
+from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
+from libc.stdint cimport int32_t, int64_t
+from libc.stdlib cimport qsort
+
+from .typedefs cimport hash_t
+
+from os import path
+
 
 cdef class Candidate:
 
@@ -139,3 +154,152 @@ cdef class KnowledgeBase:
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
+
+
+    def dump(self, loc):
+        """Write a single hard-coded placeholder entry to the file at `loc`.
+
+        loc: output path (str or bytes); an existing file is overwritten.
+        """
+        # TODO: actually dump the data in this KB :-)
+
+        cdef int64_t entry_id = 32
+        self.vocab.strings.add("Q342")
+        cdef hash_t entity_hash = self.vocab.strings["Q342"]
+        cdef float prob = 0.333
+
+        cdef Writer writer = Writer(loc)
+        try:
+            writer.write(entry_id, entity_hash, prob)
+        finally:
+            # Close even when the write fails so the handle is not leaked.
+            writer.close()
+
+    def load(self, loc):
+        """Read one entry back from the file at `loc`.
+
+        The values only populate a local struct for now; nothing is added
+        to this KB yet.
+
+        loc: path of the file to read (str or bytes).
+        """
+        cdef int64_t entry_id
+        cdef hash_t entity_hash
+        cdef float prob
+
+        cdef Reader reader = Reader(loc)
+        reader.read(self.mem, &entry_id, &entity_hash, &prob)
+
+        cdef _EntryC entry
+        entry.entity_hash = entity_hash
+        entry.prob = prob
+
+        # TODO
+        cdef int32_t dummy_value = 342
+        entry.vector_rows = &dummy_value
+        entry.feats_row = dummy_value
+
+cdef class Writer:
+    """Write fixed-size binary entry records to a file via C stdio."""
+
+    def __init__(self, object loc):
+        """Open `loc` for binary writing, truncating any existing file.
+
+        loc: output path (str or bytes); must not be a directory.
+        """
+        if path.exists(loc):
+            assert not path.isdir(loc), "%s is directory." % loc
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self._fp = fopen(bytes_loc, 'wb')
+        if not self._fp:
+            # Same failure path as Reader: IOError carrying errno.
+            PyErr_SetFromErrno(IOError)
+        fseek(self._fp, 0, 0)
+
+    def __dealloc__(self):
+        # Safety net: release the stream if close() was never called.
+        if self._fp != NULL:
+            fclose(self._fp)
+
+    def close(self):
+        """Flush and close the file; a second call is a no-op."""
+        cdef int status
+        if self._fp != NULL:
+            status = fclose(self._fp)
+            # The stream is gone even if fclose reported an error.
+            self._fp = NULL
+            if status != 0:
+                PyErr_SetFromErrno(IOError)
+
+    cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1:
+        """Append one record: entry_id, entry_hash, entry_prob, in that order.
+
+        RETURNS (int): 0 on success; IOError is raised if a write fails.
+        """
+        # TODO: feats_rows and vector rows
+
+        _write(&entry_id, sizeof(entry_id), self._fp)
+        _write(&entry_hash, sizeof(entry_hash), self._fp)
+        _write(&entry_prob, sizeof(entry_prob), self._fp)
+        return 0
+
+
+cdef int _write(void* value, size_t size, FILE* fp) except -1:
+    """Write `size` bytes starting at `value` to `fp` as one item."""
+    status = fwrite(value, size, 1, fp)
+    if status != 1:
+        raise IOError("error writing entry to output file")
+    return 0
+
+
+cdef class Reader:
+    """Read the binary entry records produced by `Writer`."""
+
+    def __init__(self, object loc):
+        """Open `loc` for binary reading.
+
+        loc: path of an existing, non-directory file (str or bytes).
+        """
+        assert path.exists(loc)
+        assert not path.isdir(loc)
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self._fp = fopen(bytes_loc, 'rb')
+        if not self._fp:
+            PyErr_SetFromErrno(IOError)
+        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
+
+    def __dealloc__(self):
+        # _fp is still NULL if __init__ failed before fopen succeeded, and
+        # fclose(NULL) is undefined behaviour.
+        if self._fp != NULL:
+            fclose(self._fp)
+
+    cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
+        """Read the next record into the three output pointers.
+
+        mem: currently unused.
+        RETURNS (int): 1 when a full record was read, 0 when end of file was
+            hit first; IOError is raised for any other read failure.
+        """
+        # The arguments are pointers, so the item size has to be that of the
+        # pointed-to type: sizeof(prob) is the pointer width, which on 64-bit
+        # builds is 8 bytes and overruns the 4-byte float behind it.
+        status = fread(entry_id, sizeof(int64_t), 1, self._fp)
+        if status < 1:
+            if feof(self._fp):
+                return 0  # end of file
+            raise IOError("error reading entry ID from input file")
+
+        status = fread(entity_hash, sizeof(hash_t), 1, self._fp)
+        if status < 1:
+            if feof(self._fp):
+                return 0  # end of file
+            raise IOError("error reading entity hash from input file")
+
+        status = fread(prob, sizeof(float), 1, self._fp)
+        if status < 1:
+            if feof(self._fp):
+                return 0  # end of file
+            raise IOError("error reading entity prob from input file")
+
+        return 1