bulk loading in proper order of entity indices

commit 6e3223f234
parent 694fea597a
@@ -424,9 +424,8 @@ if __name__ == "__main__":
     # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)

     # STEP 3 : write KB to file
-    # TODO
-    nlp = spacy.load('en_core_web_sm')
-    kb1 = KnowledgeBase(vocab=nlp.vocab)
+    nlp1 = spacy.load('en_core_web_sm')
+    kb1 = KnowledgeBase(vocab=nlp1.vocab)

     kb1.add_entity(entity="Q53", prob=0.33)
     kb1.add_entity(entity="Q17", prob=0.1)
@@ -437,11 +436,11 @@ if __name__ == "__main__":
     kb1.dump(KB_FILE)

     # STEP 4 : read KB back in from file
-    # TODO

-    kb2 = KnowledgeBase(vocab=nlp.vocab)
-    kb2.load(KB_FILE)
-    print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases())
+    nlp3 = spacy.load('en_core_web_sm')
+    kb3 = KnowledgeBase(vocab=nlp3.vocab)
+    kb3.load_bulk(7, KB_FILE)
+    print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())

     # STEP 5 : actually use the EL functionality
     # add_el(my_kb, nlp)
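The two hunks above rewrite STEP 3 and STEP 4 of the example script so the KB is round-tripped through the new load_bulk call. A minimal sketch of that flow, using only the work-in-progress API visible in this commit; KB_FILE is a hypothetical path (the real script defines its own constant), and the count passed to load_bulk has to match the number of dumped entities (the script passes 7, of which only two additions are visible in the hunk, so 2 is used here).

```python
import spacy
from spacy.kb import KnowledgeBase

KB_FILE = "/tmp/kb"  # hypothetical path; the real script defines its own constant

# STEP 3: build a KB and dump it to disk
nlp1 = spacy.load("en_core_web_sm")
kb1 = KnowledgeBase(vocab=nlp1.vocab)
kb1.add_entity(entity="Q53", prob=0.33)
kb1.add_entity(entity="Q17", prob=0.1)
kb1.dump(KB_FILE)

# STEP 4: read it back into a fresh vocab; at this stage the number of
# entities must be passed in explicitly (here 2, matching what was added above)
nlp3 = spacy.load("en_core_web_sm")
kb3 = KnowledgeBase(vocab=nlp3.vocab)
kb3.load_bulk(2, KB_FILE)
print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())
```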
spacy/kb.pxd (57 changes)

@@ -1,48 +1,17 @@
 """Knowledge-base for entity or concept linking."""
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap

 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport FILE

 from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
+from .structs cimport EntryC, AliasC

-from libc.stdio cimport FILE
-
-# Internal struct, for storage and disambiguation. This isn't what we return
-# to the user as the answer to "here's your entity". It's the minimum number
-# of bits we need to keep track of the answers.
-cdef struct _EntryC:
-
-    # The hash of this entry's unique ID/name in the kB
-    hash_t entity_hash
-
-    # Allows retrieval of one or more vectors.
-    # Each element of vector_rows should be an index into a vectors table.
-    # Every entry should have the same number of vectors, so we can avoid storing
-    # the number of vectors in each knowledge-base struct
-    int32_t* vector_rows
-
-    # Allows retrieval of a struct of non-vector features. We could make this a
-    # pointer, but we have 32 bits left over in the struct after prob, so we'd
-    # like this to only be 32 bits. We can also set this to -1, for the common
-    # case where there are no features.
-    int32_t feats_row
-
-    # log probability of entity, based on corpus frequency
-    float prob
-
-
-# Each alias struct stores a list of Entry pointers with their prior probabilities
-# for this specific mention/alias.
-cdef struct _AliasC:
-
-    # All entry candidates for this alias
-    vector[int64_t] entry_indices
-
-    # Prior probability P(entity|alias) - should sum up to (at most) 1.
-    vector[float] probs
+ctypedef vector[EntryC] entry_vec
+ctypedef vector[AliasC] alias_vec


 # Object used by the Entity Linker that summarizes one entity-alias candidate combination.
@@ -68,7 +37,7 @@ cdef class KnowledgeBase:
     # over allocation.
     # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
     # Storing 1m entries would take 41.6mb under this scheme.
-    cdef vector[_EntryC] _entries
+    cdef entry_vec _entries

     # This maps 64bit keys (hash of unique alias string)
     # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
@@ -78,7 +47,7 @@ cdef class KnowledgeBase:
     # should be P(entity | mention), which is pretty important to know.
     # We can pack both pieces of information into a 64-bit value, to keep things
     # efficient.
-    cdef vector[_AliasC] _aliases_table
+    cdef alias_vec _aliases_table

     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
@@ -98,6 +67,7 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table

+
     cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
                                      int32_t* vector_rows, int feats_row) nogil:
         """Add an entry to the vector of entries.
@@ -107,7 +77,7 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._entries.size()

         # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
-        cdef _EntryC entry
+        cdef EntryC entry
         entry.entity_hash = entity_hash
         entry.vector_rows = vector_rows
         entry.feats_row = feats_row
@@ -124,7 +94,7 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._aliases_table.size()

         # Avoid struct initializer to enable nogil
-        cdef _AliasC alias
+        cdef AliasC alias
         alias.entry_indices = entry_indices
         alias.probs = probs

@@ -140,7 +110,7 @@ cdef class KnowledgeBase:
         cdef int32_t dummy_value = 0

         # Avoid struct initializer to enable nogil
-        cdef _EntryC entry
+        cdef EntryC entry
         entry.entity_hash = dummy_hash
         entry.vector_rows = &dummy_value
         entry.feats_row = dummy_value
@@ -152,20 +122,21 @@ cdef class KnowledgeBase:
         cdef vector[float] dummy_probs
         dummy_probs.push_back(0)

-        cdef _AliasC alias
+        cdef AliasC alias
         alias.entry_indices = dummy_entry_indices
         alias.probs = dummy_probs

         self._entries.push_back(entry)
         self._aliases_table.push_back(alias)

+    cpdef load_bulk(self, int nr_entities, loc)
+

 cdef class Writer:
     cdef FILE* _fp

     cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1


 cdef class Reader:
     cdef FILE* _fp
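The net effect in kb.pxd is that the entry and alias records now live in spacy/structs.pxd as EntryC and AliasC, and the KB keeps two parallel structures per record type: a hash-to-index map and a typed vector of structs, with a dummy record parked at index 0. A minimal pure-Python model of that layout may help; the class and tuple types below are illustrative stand-ins, not the Cython declarations.

```python
# Minimal pure-Python model of the KB layout declared above (illustrative only;
# the real KB stores EntryC structs in a C++ vector and hashes in a PreshMap).
class ToyKB:
    def __init__(self):
        self.entry_index = {}             # entity hash -> position in entries
        self.entries = [(0, 0.0)]         # index 0 is the dummy entry

    def add_entity(self, entity_hash, prob):
        new_index = len(self.entries)     # mirrors _entries.size()
        self.entries.append((entity_hash, prob))
        self.entry_index[entity_hash] = new_index
        return new_index

    def get_size_entities(self):
        return len(self.entry_index)      # the dummy slot is never counted

kb = ToyKB()
kb.add_entity(entity_hash=hash("Q53"), prob=0.33)
assert kb.get_size_entities() == 1 and len(kb.entries) == 2
```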
spacy/kb.pyx (65 changes)

@@ -7,6 +7,9 @@ from cpython.exc cimport PyErr_CheckSignals
 from spacy import util
 from spacy.errors import Errors, Warnings, user_warning

+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+
 from cpython.mem cimport PyMem_Malloc
 from cpython.exc cimport PyErr_SetFromErrno

@@ -17,6 +20,8 @@ from libc.stdlib cimport qsort
 from .typedefs cimport hash_t

 from os import path
+from libcpp.vector cimport vector
+


 cdef class Candidate:
@@ -53,7 +58,6 @@ cdef class Candidate:

-
 cdef class KnowledgeBase:

     def __init__(self, Vocab vocab):
         self.vocab = vocab
         self.mem = Pool()
@@ -67,13 +71,13 @@ cdef class KnowledgeBase:
         return self.get_size_entities()

     def get_size_entities(self):
-        return self._entries.size() - 1  # not counting dummy element on index 0
+        return len(self._entry_index)

     def get_entity_strings(self):
         return [self.vocab.strings[x] for x in self._entry_index][1:]  # removing the dummy element on index 0

     def get_size_aliases(self):
-        return self._aliases_table.size() - 1  # not counting dummy element on index 0
+        return len(self._alias_index)

     def get_alias_strings(self):
         return [self.vocab.strings[x] for x in self._alias_index][1:]  # removing the dummy element on index 0
@@ -159,33 +163,44 @@ cdef class KnowledgeBase:
     def dump(self, loc):
         cdef Writer writer = Writer(loc)

-        for key, entry_index in self._entry_index.items():
+        # dumping the entry records in the order in which they are in the _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        i = 1
+        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
             entry = self._entries[entry_index]
             print("dumping")
             print("index", entry_index)
             print("hash", entry.entity_hash)
+            assert entry.entity_hash == entry_hash
+            assert entry_index == i
             print("prob", entry.prob)
             print("")
             writer.write(entry_index, entry.entity_hash, entry.prob)
+            i = i+1

         writer.close()

-    def load(self, loc):
+    cpdef load_bulk(self, int nr_entities, loc):
+        # TODO: nr_entities from header in file (Reader constructor)
         cdef int64_t entry_id
         cdef hash_t entity_hash
         cdef float prob
-        cdef _EntryC entry
+        cdef EntryC entry
         cdef int32_t dummy_value = 342

         cdef Reader reader = Reader(loc)
-        result = reader.read(self.mem, &entry_id, &entity_hash, &prob)  # -1: error, 0: eof after this one
-        while result:
-            print("loading")
-            print("entryID", entry_id)
-            print("hash", entity_hash)
-            print("prob", prob)
-            print("result:", result)
-            print("")
+        to_read = self.get_size_entities()
+
+        self._entry_index = PreshMap(nr_entities+1)
+        self._entries = entry_vec(nr_entities+1)
+
+        # we assume the data was written in sequence
+        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # TODO: should we initialize the dummy objects ?
+        cdef int i = 1
+        while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities:
+            assert i == entry_id

             entry.entity_hash = entity_hash
             entry.prob = prob
@@ -193,9 +208,18 @@ cdef class KnowledgeBase:
             entry.vector_rows = &dummy_value
             entry.feats_row = dummy_value

-            # TODO: use set instead of push_back to ensure the index remains the same?
-            self._entries.push_back(entry)
-            result = reader.read(self.mem, &entry_id, &entity_hash, &prob)
+            print("bulk loading")
+            print("i", i)
+            print("entryID", entry_id)
+            print("hash", entry.entity_hash)
+            print("prob", entry.prob)
+            print("")
+
+            self._entries[i] = entry
+            self._entry_index[entity_hash] = i
+
+            i += 1


 cdef class Writer:
     def __init__(self, object loc):
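The substance of the commit is in these two hunks: dump now writes the entry records sorted by their position in _entries (skipping the dummy at index 0) and asserts that hashes and indices line up, while load_bulk pre-sizes _entry_index and _entries and writes each record back into slot i instead of pushing onto the end. A plain-Python model of that round trip, with tuples standing in for EntryC, shows the invariant being preserved.

```python
# Plain-Python model of the round trip enforced above: records leave dump()
# sorted by entry index (dummy slot 0 is skipped) and load_bulk() writes each
# record back into the same slot, so hash -> index mappings survive intact.
def dump_entries(entry_index, entries):
    records = []
    for i, (entity_hash, index) in enumerate(
            sorted(entry_index.items(), key=lambda x: x[1]), start=1):
        stored_hash, prob = entries[index]
        assert stored_hash == entity_hash and index == i
        records.append((index, entity_hash, prob))
    return records

def load_bulk(records, nr_entities):
    entries = [(0, 0.0)] + [None] * nr_entities   # slot 0 stays a dummy entry
    entry_index = {}
    for i, (entry_id, entity_hash, prob) in enumerate(records, start=1):
        assert i == entry_id                      # data was written in sequence
        entries[i] = (entity_hash, prob)
        entry_index[entity_hash] = i
    return entry_index, entries

original_index = {111: 1, 222: 2, 333: 3}
original_entries = [(0, 0.0), (111, 0.33), (222, 0.1), (333, 0.05)]
records = dump_entries(original_index, original_entries)
assert load_bulk(records, nr_entities=3) == (original_index, original_entries)
```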
@@ -236,11 +260,6 @@ cdef class Reader:
         fclose(self._fp)

     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1:
-        """
-        Return values:
-        -1: error during current read (EOF during call)
-        0: means we read the last line succesfully (EOF after call)
-        1: we can continue reading this file """
         status = fread(entry_id, sizeof(int64_t), 1, self._fp)
         if status < 1:
             if feof(self._fp):
@@ -263,3 +282,5 @@ cdef class Reader:
                 return 0
             else:
                 return 1
+
+
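For reference, the Writer/Reader pair serializes one fixed-size record per entry: an int64 entry id, a uint64 entity hash (hash_t), and a 32-bit float probability, written with fwrite in that order. The sketch below models that record in Python; the byte order and exact packing are assumptions made for illustration, not a byte-for-byte guarantee of what the C code emits.

```python
# Rough model of one on-disk record exchanged by Writer/Reader: an int64
# entry id, a uint64 entity hash (hash_t) and a 32-bit float probability,
# packed back to back. Byte order and sizes here are assumptions; the real
# code fwrites/freads the C values directly.
import struct

RECORD = struct.Struct("=qQf")  # entry_id, entity_hash, prob (no padding)

def write_record(fp, entry_id, entity_hash, prob):
    fp.write(RECORD.pack(entry_id, entity_hash, prob))

def read_record(fp):
    buf = fp.read(RECORD.size)
    if len(buf) < RECORD.size:   # EOF, analogous to the Reader's feof() branch
        return None
    return RECORD.unpack(buf)    # -> (entry_id, entity_hash, prob)
```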
spacy/structs.pxd

@@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
 from .typedefs cimport flags_t, attr_t, hash_t
 from .parts_of_speech cimport univ_pos_t

+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t, int64_t
+
+

 cdef struct LexemeC:
     flags_t flags
@@ -72,3 +76,36 @@ cdef struct TokenC:
     attr_t ent_type  # TODO: Is there a better way to do this? Multiple sources of truth..
     attr_t ent_kb_id
     hash_t ent_id
+
+
+# Internal struct, for storage and disambiguation of entities.
+cdef struct EntryC:
+
+    # The hash of this entry's unique ID/name in the kB
+    hash_t entity_hash
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid storing
+    # the number of vectors in each knowledge-base struct
+    int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+
+    # log probability of entity, based on corpus frequency
+    float prob
+
+
+# Each alias struct stores a list of Entry pointers with their prior probabilities
+# for this specific mention/alias.
+cdef struct AliasC:
+
+    # All entry candidates for this alias
+    vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    vector[float] probs
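The AliasC comment above carries the one numerical invariant of the structure: the prior probabilities P(entity|alias) attached to an alias's candidate entities should sum to at most 1. A small illustrative check of that constraint is sketched below; how KnowledgeBase.add_alias itself validates its inputs may differ.

```python
# Illustrative check of the AliasC invariant: candidate priors for one alias
# should sum to <= 1 (any remainder is unassigned probability mass).
def check_alias_priors(entities, probs, tolerance=1e-6):
    if len(entities) != len(probs):
        raise ValueError("need exactly one prior probability per candidate entity")
    total = sum(probs)
    if total > 1.0 + tolerance:
        raise ValueError("priors sum to %.3f, expected <= 1" % total)
    return dict(zip(entities, probs))

check_alias_priors(["Q53", "Q17"], [0.6, 0.3])  # valid: 0.1 left unassigned
```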