mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
custom reader and writer for _EntryC fields (first stab at it - not complete)
This commit is contained in:
parent
004e5e7d1c
commit
8e70a564f1
|
@ -16,6 +16,8 @@ ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-art
|
||||||
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
|
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
|
||||||
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||||
|
|
||||||
|
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||||
|
|
||||||
|
|
||||||
# these will/should be matched ignoring case
|
# these will/should be matched ignoring case
|
||||||
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
|
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
|
||||||
|
@ -418,14 +420,22 @@ if __name__ == "__main__":
|
||||||
# _read_wikipedia_prior_probs()
|
# _read_wikipedia_prior_probs()
|
||||||
|
|
||||||
# STEP 2 : create KB
|
# STEP 2 : create KB
|
||||||
nlp = spacy.load('en_core_web_sm')
|
# nlp = spacy.load('en_core_web_sm')
|
||||||
my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
|
# my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
|
||||||
|
|
||||||
# STEP 3 : write KB to file
|
# STEP 3 : write KB to file
|
||||||
# TODO
|
# TODO
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||||
|
kb.dump(KB_FILE)
|
||||||
|
print("DUMPED")
|
||||||
|
kb.load(KB_FILE)
|
||||||
|
print("LOADED")
|
||||||
|
|
||||||
|
# PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||||
|
|
||||||
# STEP 4 : read KB back in from file
|
# STEP 4 : read KB back in from file
|
||||||
# TODO
|
# TODO
|
||||||
|
|
||||||
# STEP 5 : actually use the EL functionality
|
# STEP 5 : actually use the EL functionality
|
||||||
add_el(my_kb, nlp)
|
# add_el(my_kb, nlp)
|
||||||
|
|
14
spacy/kb.pxd
14
spacy/kb.pxd
|
@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, int64_t
|
||||||
from spacy.vocab cimport Vocab
|
from spacy.vocab cimport Vocab
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
|
from libc.stdio cimport FILE
|
||||||
|
|
||||||
|
|
||||||
# Internal struct, for storage and disambiguation. This isn't what we return
|
# Internal struct, for storage and disambiguation. This isn't what we return
|
||||||
# to the user as the answer to "here's your entity". It's the minimum number
|
# to the user as the answer to "here's your entity". It's the minimum number
|
||||||
|
@ -158,3 +160,15 @@ cdef class KnowledgeBase:
|
||||||
self._aliases_table.push_back(alias)
|
self._aliases_table.push_back(alias)
|
||||||
|
|
||||||
|
|
||||||
|
# Declaration of the binary writer for knowledge base entries; the method
# bodies live in kb.pyx.
cdef class Writer:
    # C stdio handle of the file being written (opened in kb.pyx with 'wb').
    cdef FILE* _fp

    # Write one (entry_id, entity_hash, prob) record to the file.
    # `except -1` lets Python exceptions propagate out of this C-level method.
    cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1
|
||||||
|
|
||||||
|
|
||||||
|
# Declaration of the binary reader for knowledge base entries; the method
# bodies live in kb.pyx.
cdef class Reader:
    # C stdio handle of the file being read (opened in kb.pyx with 'rb').
    cdef FILE* _fp
    # NOTE(review): nr_feat is not referenced by the Reader implementation
    # shown in kb.pyx - possibly a leftover; confirm before relying on it.
    cdef public int32_t nr_feat

    # Read one record into the three output pointers.
    # `except -1` lets Python exceptions propagate out of this C-level method.
    # NOTE(review): `mem` is accepted but unused by the visible implementation.
    cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1
|
||||||
|
|
||||||
|
|
106
spacy/kb.pyx
106
spacy/kb.pyx
|
@ -1,8 +1,23 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
|
from collections import OrderedDict
|
||||||
|
from cpython.exc cimport PyErr_CheckSignals
|
||||||
|
|
||||||
|
from spacy import util
|
||||||
from spacy.errors import Errors, Warnings, user_warning
|
from spacy.errors import Errors, Warnings, user_warning
|
||||||
|
|
||||||
|
from cpython.mem cimport PyMem_Malloc
|
||||||
|
from cpython.exc cimport PyErr_SetFromErrno
|
||||||
|
|
||||||
|
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||||
|
from libc.stdint cimport int32_t, int64_t
|
||||||
|
from libc.stdlib cimport qsort
|
||||||
|
|
||||||
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
|
|
||||||
|
@ -139,3 +154,94 @@ cdef class KnowledgeBase:
|
||||||
prior_prob=prob)
|
prior_prob=prob)
|
||||||
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||||
if entry_index != 0]
|
if entry_index != 0]
|
||||||
|
|
||||||
|
|
||||||
|
def dump(self, loc):
    """Write the knowledge base to the file at `loc`.

    Work in progress: instead of the real KB contents, a single hard-coded
    placeholder record (entry id 32, entity "Q342", prior 0.333) is written
    so that the Writer/Reader round trip can be exercised.

    loc: Path of the output file, passed on to `Writer`.
    """
    # TODO: actually dump the data in this KB :-)

    cdef int64_t entry_id = 32
    # The string has to be in the StringStore before its hash is looked up.
    self.vocab.strings.add("Q342")
    cdef hash_t entity_hash = self.vocab.strings["Q342"]
    cdef float prob = 0.333

    cdef Writer writer = Writer(loc)
    try:
        writer.write(entry_id, entity_hash, prob)
    finally:
        # Always release the file handle, even when the write fails.
        writer.close()
|
||||||
|
|
||||||
|
def load(self, loc):
    """Read a single record from the file at `loc`.

    Work in progress: the record is unpacked into a local `_EntryC` that is
    discarded when the method returns; nothing is stored on the KB yet.

    loc: Path of the input file, passed on to `Reader`.
    """
    cdef Reader source = Reader(loc)
    cdef int64_t stored_id
    cdef hash_t stored_hash
    cdef float stored_prob
    cdef int32_t placeholder = 342
    cdef _EntryC loaded

    # NOTE(review): the status returned by read() is ignored, so an empty
    # file leaves the three output variables uninitialised.
    source.read(self.mem, &stored_id, &stored_hash, &stored_prob)

    # TODO: vector_rows and feats_row are not serialised yet, so they are
    # filled with a placeholder value for now.
    loaded.feats_row = placeholder
    loaded.vector_rows = &placeholder
    loaded.prob = stored_prob
    loaded.entity_hash = stored_hash
|
||||||
|
|
||||||
|
cdef class Writer:
    """Write knowledge base entries to a binary file through C stdio.

    Each record is written as the raw bytes of an int64 entry id, a hash_t
    entity hash and a float prior probability, in that order.
    """
    def __init__(self, object loc):
        """Open `loc` for binary writing, truncating any existing file.

        loc: Output path, as unicode or bytes.
        RAISES (IOError): If `loc` is a directory or cannot be opened.
        """
        # Real exceptions rather than asserts: asserts can be disabled, which
        # would leave a NULL FILE* to be used by later writes.
        if path.isdir(loc):
            raise IOError("%s is directory." % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if self._fp == NULL:
            PyErr_SetFromErrno(IOError)
        # No fseek needed: a freshly opened 'wb' stream starts at offset 0.

    def __dealloc__(self):
        # Safety net so the handle is not leaked when close() was never called.
        if self._fp != NULL:
            fclose(self._fp)
            self._fp = NULL

    def close(self):
        """Flush and close the file. Calling it again is a no-op.

        RAISES (IOError): If the underlying fclose() reports a failure.
        """
        cdef int status
        if self._fp == NULL:
            return
        status = fclose(self._fp)
        # The stream is gone whether or not fclose succeeded; forget the
        # pointer so neither __dealloc__ nor a second close() touches it.
        self._fp = NULL
        if status != 0:
            PyErr_SetFromErrno(IOError)

    cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1:
        """Write one record to the file.

        entry_id: Identifier of the entry.
        entry_hash: Hash of the entity string.
        entry_prob: Prior probability of the entity.
        RETURNS (int): 0 on success.
        RAISES (IOError): If the file is closed or a write fails.
        """
        if self._fp == NULL:
            raise IOError("cannot write to a closed file")

        # TODO: feats_rows and vector rows

        _write(&entry_id, sizeof(entry_id), self._fp)
        _write(&entry_hash, sizeof(entry_hash), self._fp)
        _write(&entry_prob, sizeof(entry_prob), self._fp)
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef int _write(void* value, size_t size, FILE* fp) except -1:
    """Write `size` raw bytes starting at `value` to `fp` as a single item.

    RETURNS (int): 0 on success.
    RAISES (IOError): If fwrite() does not report exactly one item written.
    """
    cdef size_t status = fwrite(value, size, 1, fp)
    # A real exception rather than an assert: asserts can be disabled, which
    # would let a short write go unnoticed and corrupt the file.
    if status != 1:
        raise IOError("error writing %d bytes to output file" % size)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Reader:
    """Read knowledge base entries from a binary file through C stdio.

    Expects the layout produced by `Writer`: per record the raw bytes of an
    int64 entry id, a hash_t entity hash and a float prior probability.
    """
    def __init__(self, object loc):
        """Open `loc` for binary reading.

        loc: Input path, as unicode or bytes.
        RAISES (IOError): If `loc` is a directory or cannot be opened.
        """
        # fopen() can succeed on a directory on some platforms, so that case
        # is rejected up front; a missing file is reported by fopen itself.
        if path.isdir(loc):
            raise IOError("%s is directory." % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)
        # The file has no header, so reading starts at offset 0.
        fseek(self._fp, 0, 0)

    def __dealloc__(self):
        # _fp is still NULL when __init__ failed; fclose(NULL) is undefined.
        if self._fp != NULL:
            fclose(self._fp)
            self._fp = NULL

    cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
        """Read the next record into the three output pointers.

        mem: Memory pool; currently unused.
        entry_id: Receives the entry identifier.
        entity_hash: Receives the hash of the entity string.
        prob: Receives the prior probability.
        RETURNS (int): 1 if a record was read, 0 if end of file was reached.
        RAISES (IOError): If a read fails for a reason other than end of file.
        """
        cdef size_t status

        # The sizes must be those of the pointed-to values, not of the
        # pointers: sizeof(prob) on a float* is the pointer width, which would
        # read past the 4-byte float and disagree with what Writer wrote.
        status = fread(entry_id, sizeof(int64_t), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entry ID from input file")

        status = fread(entity_hash, sizeof(hash_t), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity hash from input file")

        status = fread(prob, sizeof(float), 1, self._fp)
        if status < 1:
            if feof(self._fp):
                return 0  # end of file
            raise IOError("error reading entity prob from input file")

        # Distinguish a successfully read record from end of file.
        return 1
|
||||||
|
|
Loading…
Reference in New Issue
Block a user