mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
rename to KBEntryC
This commit is contained in:
parent
1de61f68d6
commit
dbc53b9870
|
@ -61,23 +61,23 @@ def run_pipeline():
|
||||||
to_create_kb = False
|
to_create_kb = False
|
||||||
|
|
||||||
# read KB back in from file
|
# read KB back in from file
|
||||||
to_read_kb = False
|
to_read_kb = True
|
||||||
to_test_kb = False
|
to_test_kb = False
|
||||||
|
|
||||||
# create training dataset
|
# create training dataset
|
||||||
create_wp_training = False
|
create_wp_training = False
|
||||||
|
|
||||||
# train the EL pipe
|
# train the EL pipe
|
||||||
train_pipe = False
|
train_pipe = True
|
||||||
measure_performance = False
|
measure_performance = True
|
||||||
|
|
||||||
# test the EL pipe on a simple example
|
# test the EL pipe on a simple example
|
||||||
to_test_pipeline = False
|
to_test_pipeline = True
|
||||||
|
|
||||||
# write the NLP object, read back in and test again
|
# write the NLP object, read back in and test again
|
||||||
to_write_nlp = False
|
to_write_nlp = True
|
||||||
to_read_nlp = False
|
to_read_nlp = False
|
||||||
test_from_file = True
|
test_from_file = False
|
||||||
|
|
||||||
# STEP 1 : create prior probabilities from WP (run only once)
|
# STEP 1 : create prior probabilities from WP (run only once)
|
||||||
if to_create_prior_probs:
|
if to_create_prior_probs:
|
||||||
|
@ -149,7 +149,7 @@ def run_pipeline():
|
||||||
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
|
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
|
||||||
# define the size (nr of entities) of training and dev set
|
# define the size (nr of entities) of training and dev set
|
||||||
train_limit = 5000
|
train_limit = 5000
|
||||||
dev_limit = 10000
|
dev_limit = 5000
|
||||||
|
|
||||||
train_data = training_set_creator.read_training(nlp=nlp_2,
|
train_data = training_set_creator.read_training(nlp=nlp_2,
|
||||||
training_dir=TRAINING_DIR,
|
training_dir=TRAINING_DIR,
|
||||||
|
@ -285,9 +285,7 @@ def _measure_accuracy(data, el_pipe=None):
|
||||||
|
|
||||||
docs = [d for d, g in data if len(d) > 0]
|
docs = [d for d, g in data if len(d) > 0]
|
||||||
if el_pipe is not None:
|
if el_pipe is not None:
|
||||||
print("applying el_pipe", datetime.datetime.now())
|
docs = list(el_pipe.pipe(docs))
|
||||||
docs = list(el_pipe.pipe(docs, batch_size=10000000000))
|
|
||||||
print("done applying el_pipe", datetime.datetime.now())
|
|
||||||
golds = [g for d, g in data if len(d) > 0]
|
golds = [g for d, g in data if len(d) > 0]
|
||||||
|
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
|
|
10
spacy/kb.pxd
10
spacy/kb.pxd
|
@ -9,8 +9,8 @@ from libc.stdio cimport FILE
|
||||||
from spacy.vocab cimport Vocab
|
from spacy.vocab cimport Vocab
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
from .structs cimport EntryC, AliasC
|
from .structs cimport KBEntryC, AliasC
|
||||||
ctypedef vector[EntryC] entry_vec
|
ctypedef vector[KBEntryC] entry_vec
|
||||||
ctypedef vector[AliasC] alias_vec
|
ctypedef vector[AliasC] alias_vec
|
||||||
ctypedef vector[float] float_vec
|
ctypedef vector[float] float_vec
|
||||||
ctypedef vector[float_vec] float_matrix
|
ctypedef vector[float_vec] float_matrix
|
||||||
|
@ -32,7 +32,7 @@ cdef class KnowledgeBase:
|
||||||
cdef int64_t entity_vector_length
|
cdef int64_t entity_vector_length
|
||||||
|
|
||||||
# This maps 64bit keys (hash of unique entity string)
|
# This maps 64bit keys (hash of unique entity string)
|
||||||
# to 64bit values (position of the EntryC struct in the _entries vector).
|
# to 64bit values (position of the KBEntryC struct in the _entries vector).
|
||||||
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
||||||
# the only overhead is the vacancy rate, which is approximately 30%.
|
# the only overhead is the vacancy rate, which is approximately 30%.
|
||||||
cdef PreshMap _entry_index
|
cdef PreshMap _entry_index
|
||||||
|
@ -88,7 +88,7 @@ cdef class KnowledgeBase:
|
||||||
cdef int64_t new_index = self._entries.size()
|
cdef int64_t new_index = self._entries.size()
|
||||||
|
|
||||||
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
|
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
|
||||||
cdef EntryC entry
|
cdef KBEntryC entry
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.vector_index = vector_index
|
entry.vector_index = vector_index
|
||||||
entry.feats_row = feats_row
|
entry.feats_row = feats_row
|
||||||
|
@ -121,7 +121,7 @@ cdef class KnowledgeBase:
|
||||||
cdef int32_t dummy_value = 0
|
cdef int32_t dummy_value = 0
|
||||||
|
|
||||||
# Avoid struct initializer to enable nogil
|
# Avoid struct initializer to enable nogil
|
||||||
cdef EntryC entry
|
cdef KBEntryC entry
|
||||||
entry.entity_hash = dummy_hash
|
entry.entity_hash = dummy_hash
|
||||||
entry.vector_index = dummy_value
|
entry.vector_index = dummy_value
|
||||||
entry.feats_row = dummy_value
|
entry.feats_row = dummy_value
|
||||||
|
|
|
@ -129,7 +129,7 @@ cdef class KnowledgeBase:
|
||||||
self._entries = entry_vec(nr_entities+1)
|
self._entries = entry_vec(nr_entities+1)
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
cdef EntryC entry
|
cdef KBEntryC entry
|
||||||
while i < nr_entities:
|
while i < nr_entities:
|
||||||
entity_vector = vector_list[i]
|
entity_vector = vector_list[i]
|
||||||
if len(entity_vector) != self.entity_vector_length:
|
if len(entity_vector) != self.entity_vector_length:
|
||||||
|
@ -250,7 +250,7 @@ cdef class KnowledgeBase:
|
||||||
cdef int64_t entry_index
|
cdef int64_t entry_index
|
||||||
cdef float prob
|
cdef float prob
|
||||||
cdef int32_t vector_index
|
cdef int32_t vector_index
|
||||||
cdef EntryC entry
|
cdef KBEntryC entry
|
||||||
cdef AliasC alias
|
cdef AliasC alias
|
||||||
cdef float vector_element
|
cdef float vector_element
|
||||||
|
|
||||||
|
|
|
@ -79,7 +79,7 @@ cdef struct TokenC:
|
||||||
|
|
||||||
|
|
||||||
# Internal struct, for storage and disambiguation of entities.
|
# Internal struct, for storage and disambiguation of entities.
|
||||||
cdef struct EntryC:
|
cdef struct KBEntryC:
|
||||||
|
|
||||||
# The hash of this entry's unique ID/name in the KB
|
# The hash of this entry's unique ID/name in the KB
|
||||||
hash_t entity_hash
|
hash_t entity_hash
|
||||||
|
|
Loading…
Reference in New Issue
Block a user