rename to KBEntryC

This commit is contained in:
svlandeg 2019-06-26 15:55:26 +02:00
parent 1de61f68d6
commit dbc53b9870
4 changed files with 16 additions and 18 deletions

View File

@ -61,23 +61,23 @@ def run_pipeline():
to_create_kb = False to_create_kb = False
# read KB back in from file # read KB back in from file
to_read_kb = False to_read_kb = True
to_test_kb = False to_test_kb = False
# create training dataset # create training dataset
create_wp_training = False create_wp_training = False
# train the EL pipe # train the EL pipe
train_pipe = False train_pipe = True
measure_performance = False measure_performance = True
# test the EL pipe on a simple example # test the EL pipe on a simple example
to_test_pipeline = False to_test_pipeline = True
# write the NLP object, read back in and test again # write the NLP object, read back in and test again
to_write_nlp = False to_write_nlp = True
to_read_nlp = False to_read_nlp = False
test_from_file = True test_from_file = False
# STEP 1 : create prior probabilities from WP (run only once) # STEP 1 : create prior probabilities from WP (run only once)
if to_create_prior_probs: if to_create_prior_probs:
@ -149,7 +149,7 @@ def run_pipeline():
print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
# define the size (nr of entities) of training and dev set # define the size (nr of entities) of training and dev set
train_limit = 5000 train_limit = 5000
dev_limit = 10000 dev_limit = 5000
train_data = training_set_creator.read_training(nlp=nlp_2, train_data = training_set_creator.read_training(nlp=nlp_2,
training_dir=TRAINING_DIR, training_dir=TRAINING_DIR,
@ -285,9 +285,7 @@ def _measure_accuracy(data, el_pipe=None):
docs = [d for d, g in data if len(d) > 0] docs = [d for d, g in data if len(d) > 0]
if el_pipe is not None: if el_pipe is not None:
print("applying el_pipe", datetime.datetime.now()) docs = list(el_pipe.pipe(docs))
docs = list(el_pipe.pipe(docs, batch_size=10000000000))
print("done applying el_pipe", datetime.datetime.now())
golds = [g for d, g in data if len(d) > 0] golds = [g for d, g in data if len(d) > 0]
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):

View File

@ -9,8 +9,8 @@ from libc.stdio cimport FILE
from spacy.vocab cimport Vocab from spacy.vocab cimport Vocab
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .structs cimport EntryC, AliasC from .structs cimport KBEntryC, AliasC
ctypedef vector[EntryC] entry_vec ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec ctypedef vector[AliasC] alias_vec
ctypedef vector[float] float_vec ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix ctypedef vector[float_vec] float_matrix
@ -32,7 +32,7 @@ cdef class KnowledgeBase:
cdef int64_t entity_vector_length cdef int64_t entity_vector_length
# This maps 64bit keys (hash of unique entity string) # This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the EntryC struct in the _entries vector). # to 64bit values (position of the KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So # The PreshMap is pretty space efficient, as it uses open addressing. So
# the only overhead is the vacancy rate, which is approximately 30%. # the only overhead is the vacancy rate, which is approximately 30%.
cdef PreshMap _entry_index cdef PreshMap _entry_index
@ -88,7 +88,7 @@ cdef class KnowledgeBase:
cdef int64_t new_index = self._entries.size() cdef int64_t new_index = self._entries.size()
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
cdef EntryC entry cdef KBEntryC entry
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.vector_index = vector_index entry.vector_index = vector_index
entry.feats_row = feats_row entry.feats_row = feats_row
@ -121,7 +121,7 @@ cdef class KnowledgeBase:
cdef int32_t dummy_value = 0 cdef int32_t dummy_value = 0
# Avoid struct initializer to enable nogil # Avoid struct initializer to enable nogil
cdef EntryC entry cdef KBEntryC entry
entry.entity_hash = dummy_hash entry.entity_hash = dummy_hash
entry.vector_index = dummy_value entry.vector_index = dummy_value
entry.feats_row = dummy_value entry.feats_row = dummy_value

View File

@ -129,7 +129,7 @@ cdef class KnowledgeBase:
self._entries = entry_vec(nr_entities+1) self._entries = entry_vec(nr_entities+1)
i = 0 i = 0
cdef EntryC entry cdef KBEntryC entry
while i < nr_entities: while i < nr_entities:
entity_vector = vector_list[i] entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length: if len(entity_vector) != self.entity_vector_length:
@ -250,7 +250,7 @@ cdef class KnowledgeBase:
cdef int64_t entry_index cdef int64_t entry_index
cdef float prob cdef float prob
cdef int32_t vector_index cdef int32_t vector_index
cdef EntryC entry cdef KBEntryC entry
cdef AliasC alias cdef AliasC alias
cdef float vector_element cdef float vector_element

View File

@ -79,7 +79,7 @@ cdef struct TokenC:
# Internal struct, for storage and disambiguation of entities. # Internal struct, for storage and disambiguation of entities.
cdef struct EntryC: cdef struct KBEntryC:
# The hash of this entry's unique ID/name in the KB # The hash of this entry's unique ID/name in the KB
hash_t entity_hash hash_t entity_hash