mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
custom reader and writer for _EntryC fields (first stab at it - not complete)
This commit is contained in:
parent
004e5e7d1c
commit
8e70a564f1
|
@ -16,6 +16,8 @@ ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-art
|
|||
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
|
||||
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||
|
||||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||
|
||||
|
||||
# these will/should be matched ignoring case
|
||||
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
|
||||
|
@ -418,14 +420,22 @@ if __name__ == "__main__":
|
|||
# _read_wikipedia_prior_probs()
|
||||
|
||||
# STEP 2 : create KB
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
|
||||
# nlp = spacy.load('en_core_web_sm')
|
||||
# my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
|
||||
|
||||
# STEP 3 : write KB to file
|
||||
# TODO
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.dump(KB_FILE)
|
||||
print("DUMPED")
|
||||
kb.load(KB_FILE)
|
||||
print("LOADED")
|
||||
|
||||
# PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||
|
||||
# STEP 4 : read KB back in from file
|
||||
# TODO
|
||||
|
||||
# STEP 5 : actually use the EL functionality
|
||||
add_el(my_kb, nlp)
|
||||
# add_el(my_kb, nlp)
|
||||
|
|
14
spacy/kb.pxd
14
spacy/kb.pxd
|
@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, int64_t
|
|||
from spacy.vocab cimport Vocab
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from libc.stdio cimport FILE
|
||||
|
||||
|
||||
# Internal struct, for storage and disambiguation. This isn't what we return
|
||||
# to the user as the answer to "here's your entity". It's the minimum number
|
||||
|
@ -158,3 +160,15 @@ cdef class KnowledgeBase:
|
|||
self._aliases_table.push_back(alias)
|
||||
|
||||
|
||||
# Declaration of the Writer extension type: it carries a C stdio FILE* and a
# C-level write() method taking an entry ID, an entity hash and a probability.
cdef class Writer:
|
||||
cdef FILE* _fp
|
||||
|
||||
# `except -1`: a return value of -1 tells Cython that a Python exception was raised.
cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
|
||||
|
||||
|
||||
# Declaration of the Reader extension type: it carries a C stdio FILE* and a
# C-level read() method that fills three caller-supplied output pointers.
cdef class Reader:
|
||||
cdef FILE* _fp
|
||||
# NOTE(review): nr_feat is exposed to Python (`public`) but is not referenced
# by the visible Reader implementation - confirm whether it is still needed.
cdef public int32_t nr_feat
|
||||
|
||||
# `except -1`: a return value of -1 tells Cython that a Python exception was raised.
cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1
|
||||
|
||||
|
|
106
spacy/kb.pyx
106
spacy/kb.pyx
|
@ -1,8 +1,23 @@
|
|||
# cython: infer_types=True
|
||||
# cython: profile=True
|
||||
# coding: utf8
|
||||
from collections import OrderedDict
|
||||
from cpython.exc cimport PyErr_CheckSignals
|
||||
|
||||
from spacy import util
|
||||
from spacy.errors import Errors, Warnings, user_warning
|
||||
|
||||
from cpython.mem cimport PyMem_Malloc
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
|
||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdlib cimport qsort
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from os import path
|
||||
|
||||
|
||||
cdef class Candidate:
|
||||
|
||||
|
@ -139,3 +154,94 @@ cdef class KnowledgeBase:
|
|||
prior_prob=prob)
|
||||
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
if entry_index != 0]
|
||||
|
||||
|
||||
def dump(self, loc):
    """Write a placeholder record to `loc` to exercise the binary Writer.

    The contents of this KB are not serialised yet. A single hard-coded
    record (entry ID 32, the hash of the string "Q342", probability 0.333)
    is written instead.

    loc: Path of the output file, handed unchanged to `Writer`.
    """
    # TODO: actually dump the data in this KB :-)

    cdef int64_t entry_id = 32
    self.vocab.strings.add("Q342")
    cdef hash_t entity_hash = self.vocab.strings["Q342"]
    cdef float prob = 0.333

    cdef Writer writer = Writer(loc)
    try:
        writer.write(entry_id, entity_hash, prob)
    finally:
        # Close the file handle even when the write raises, so it never leaks.
        writer.close()
|
||||
|
||||
def load(self, loc):
    """Read one record from `loc` and copy it into a temporary _EntryC.

    Work in progress: the struct is filled in but never stored on the KB,
    and its vector/feature fields only receive a dummy value.

    loc: Path of the input file, handed unchanged to `Reader`.
    """
    cdef int64_t stored_id
    cdef hash_t stored_hash
    cdef float stored_prob
    cdef _EntryC record
    cdef int32_t placeholder = 342

    cdef Reader source = Reader(loc)
    source.read(self.mem, &stored_id, &stored_hash, &stored_prob)

    record.entity_hash = stored_hash
    record.prob = stored_prob

    # TODO
    record.vector_rows = &placeholder
    record.feats_row = placeholder
|
||||
|
||||
cdef class Writer:
    """Binary writer for KB entry records, backed by a C `FILE*`.

    Each record is written as the raw bytes of the entry ID (int64_t), the
    entity hash (hash_t) and the probability (float), in that order.
    """
    def __init__(self, object loc):
        """Open `loc` for binary writing.

        loc: Output path, as a unicode or byte string.
        Raises IOError if `loc` is a directory or the file cannot be opened.
        """
        # Explicit checks instead of `assert`: assertions can be compiled out,
        # which would let a NULL handle reach the C calls below.
        if path.isdir(loc):
            raise IOError("%s is directory." % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if self._fp == NULL:
            # Raises IOError built from errno (same handling as Reader).
            PyErr_SetFromErrno(IOError)
        fseek(self._fp, 0, 0)

    def __dealloc__(self):
        # Release the handle if the owner never called close().
        if self._fp != NULL:
            fclose(self._fp)
            self._fp = NULL

    def close(self):
        """Close the underlying file. Calling it again is a no-op.

        Raises IOError if the C library reports a failure while closing.
        """
        cdef int status
        if self._fp == NULL:
            return
        status = fclose(self._fp)
        # The stream is unusable after fclose, whether or not it succeeded.
        self._fp = NULL
        if status != 0:
            PyErr_SetFromErrno(IOError)

    cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1:
        """Append one record (ID, hash, probability) to the file.

        Returns 0 on success; raises IOError if the writer is closed or a
        field cannot be written.
        """
        if self._fp == NULL:
            raise IOError("cannot write: the output file is closed")

        # TODO: feats_rows and vector rows

        _write(&entry_id, sizeof(entry_id), self._fp)
        _write(&entry_hash, sizeof(entry_hash), self._fp)
        _write(&entry_prob, sizeof(entry_prob), self._fp)
        return 0
|
||||
|
||||
|
||||
cdef int _write(void* value, size_t size, FILE* fp) except -1:
    # Write a single item of `size` bytes from `value` to `fp`.
    # Returns 0 on success and raises IOError when fwrite does not report
    # exactly one item written. An explicit raise is used because `assert`
    # can be compiled out, which would hide a failed write.
    status = fwrite(value, size, 1, fp)
    if status != 1:
        raise IOError("error writing to output file (fwrite returned %d)" % status)
    return 0
|
||||
|
||||
|
||||
cdef class Reader:
    """Binary reader for KB entry records, backed by a C `FILE*`.

    Records are read field by field in this order: entry ID (int64_t),
    entity hash (hash_t), probability (float).
    """
    def __init__(self, object loc):
        """Open `loc` for binary reading.

        loc: Input path, as a unicode or byte string.
        Raises IOError if `loc` does not exist, is a directory, or cannot be
        opened.
        """
        # Explicit checks instead of `assert`: assertions can be compiled out.
        if not path.exists(loc):
            raise IOError("%s does not exist." % loc)
        if path.isdir(loc):
            raise IOError("%s is directory." % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
        fseek(self._fp, 0, 0)  # this can be 0 if there is no header

    def __dealloc__(self):
        # __init__ may have raised before fopen succeeded; never fclose(NULL).
        if self._fp != NULL:
            fclose(self._fp)
            self._fp = NULL

    cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
        """Read the next record into the three output pointers.

        Returns 1 when a full record was read and 0 when end of file was
        reached first. Raises IOError on any other read failure.
        """
        # The sizes must be those of the pointed-to types. `sizeof` of the
        # pointer parameters would read 8 bytes into the 4-byte float on
        # 64-bit builds.
        status = fread(entry_id, sizeof(int64_t), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entry ID from input file")

        status = fread(entity_hash, sizeof(hash_t), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity hash from input file")

        status = fread(prob, sizeof(float), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity prob from input file")

        return 1
|
||||
|
|
Loading…
Reference in New Issue
Block a user