From dbc53b9870a76840d50c29cd1708e02c02414756 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Jun 2019 15:55:26 +0200 Subject: [PATCH] rename to KBEntryC --- examples/pipeline/wikidata_entity_linking.py | 18 ++++++++---------- spacy/kb.pxd | 10 +++++----- spacy/kb.pyx | 4 ++-- spacy/structs.pxd | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2d300f699..9ce3b9559 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,23 +61,23 @@ def run_pipeline(): to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False # create training dataset create_wp_training = False # train the EL pipe - train_pipe = False - measure_performance = False + train_pipe = True + measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = False + to_write_nlp = True to_read_nlp = False - test_from_file = True + test_from_file = False # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: @@ -149,7 +149,7 @@ def run_pipeline(): print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set train_limit = 5000 - dev_limit = 10000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -285,9 +285,7 @@ def _measure_accuracy(data, el_pipe=None): docs = [d for d, g in data if len(d) > 0] if el_pipe is not None: - print("applying el_pipe", datetime.datetime.now()) - docs = list(el_pipe.pipe(docs, batch_size=10000000000)) - print("done applying el_pipe", datetime.datetime.now()) + docs = list(el_pipe.pipe(docs)) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff 
--git a/spacy/kb.pxd b/spacy/kb.pxd index ccf150cd2..40b22b275 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -9,8 +9,8 @@ from libc.stdio cimport FILE from spacy.vocab cimport Vocab from .typedefs cimport hash_t -from .structs cimport EntryC, AliasC -ctypedef vector[EntryC] entry_vec +from .structs cimport KBEntryC, AliasC +ctypedef vector[KBEntryC] entry_vec ctypedef vector[AliasC] alias_vec ctypedef vector[float] float_vec ctypedef vector[float_vec] float_matrix @@ -32,7 +32,7 @@ cdef class KnowledgeBase: cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) - # to 64bit values (position of the _EntryC struct in the _entries vector). + # to 64bit values (position of the KBEntryC struct in the _entries vector). # The PreshMap is pretty space efficient, as it uses open addressing. So # the only overhead is the vacancy rate, which is approximately 30%. cdef PreshMap _entry_index @@ -88,7 +88,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._entries.size() # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 - cdef EntryC entry + cdef KBEntryC entry entry.entity_hash = entity_hash entry.vector_index = vector_index entry.feats_row = feats_row @@ -121,7 +121,7 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 0 # Avoid struct initializer to enable nogil - cdef EntryC entry + cdef KBEntryC entry entry.entity_hash = dummy_hash entry.vector_index = dummy_value entry.feats_row = dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4d9d2b89b..7c2daa659 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -129,7 +129,7 @@ cdef class KnowledgeBase: self._entries = entry_vec(nr_entities+1) i = 0 - cdef EntryC entry + cdef KBEntryC entry while i < nr_entities: entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: @@ -250,7 +250,7 @@ cdef class KnowledgeBase: cdef int64_t entry_index cdef float prob cdef int32_t vector_index - cdef EntryC entry + cdef 
KBEntryC entry cdef AliasC alias cdef float vector_element diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 8de4d5f4c..e80b1b4d6 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -79,7 +79,7 @@ cdef struct TokenC: # Internal struct, for storage and disambiguation of entities. -cdef struct EntryC: +cdef struct KBEntryC: # The hash of this entry's unique ID/name in the kB hash_t entity_hash