very minimal KB functionality working

This commit is contained in:
svlandeg 2019-03-18 17:27:51 +01:00
parent af281c5466
commit cf34113250
5 changed files with 47 additions and 16 deletions

View File

@ -40,6 +40,7 @@ MOD_NAMES = [
"spacy.lexeme", "spacy.lexeme",
"spacy.vocab", "spacy.vocab",
"spacy.attrs", "spacy.attrs",
"spacy.kb",
"spacy.morphology", "spacy.morphology",
"spacy.pipeline.pipes", "spacy.pipeline.pipes",
"spacy.syntax.stateclass", "spacy.syntax.stateclass",

View File

@ -4,6 +4,7 @@ from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t, int64_t
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .strings cimport hash_string
# Internal struct, for storage and disambiguation. This isn't what we return # Internal struct, for storage and disambiguation. This isn't what we return
@ -32,10 +33,10 @@ cdef struct _EntryC:
cdef struct _AliasC: cdef struct _AliasC:
# All entry candidates for this alias # All entry candidates for this alias
const vector[int64_t] entry_indices vector[int64_t] entry_indices
# Prior probability P(entity|alias) - should sum up to (at most) 1. # Prior probability P(entity|alias) - should sum up to (at most) 1.
const vector[float] probs vector[float] probs
cdef class KnowledgeBase: cdef class KnowledgeBase:
@ -94,13 +95,21 @@ cdef class KnowledgeBase:
feats_row=feats_row, feats_row=feats_row,
prob=prob prob=prob
)) ))
self._index[entity_key] = entity_index self._entry_index[entity_key] = entity_index
return entity_index return entity_index
cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs): cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
"""Connect a mention to a list of potential entities with their prior probabilities.""" """Connect a mention to a list of potential entities with their prior probabilities."""
cdef int64_t alias_index = self._aliases_table.size() cdef int64_t alias_index = self._aliases_table.size()
cdef vector[int64_t] entry_indices
cdef vector[float] probs
for entity, prob in zip(entities, probs):
entry_index = self._entry_index[hash_string(entity)]
entry_indices.push_back(entry_index)
probs.push_back(prob)
self._aliases_table.push_back( self._aliases_table.push_back(
_AliasC( _AliasC(
entry_indices=entry_indices, entry_indices=entry_indices,

View File

@ -1,34 +1,42 @@
from .strings cimport hash_string # cython: profile=True
# coding: utf8
from preshed.maps import PreshMap
cdef class KnowledgeBase: cdef class KnowledgeBase:
def __init__(self):
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.mem = Pool()
def __len__(self): def __len__(self):
return self._entries.size() return self._entries.size()
def add_entity(self, entity_id: str, float prob, vectors=None, features=None): def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
cdef hash_t id_hash = hash_string(entity_id)
# TODO: more friendly check for non-unique name # TODO: more friendly check for non-unique name
if entity_id in self: if id_hash in self._entry_index:
return return
cdef hash_t id_hash = hash_string(entity_id)
cdef int32_t dummy_value = 342 cdef int32_t dummy_value = 342
self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
# TODO self._vectors_table.get_pointer(vectors), # TODO self._vectors_table.get_pointer(vectors),
# self._features_table.get(features)) # self._features_table.get(features))
def add_alias(self, alias, entities, probabilities): def add_alias(self, unicode alias, entities, probabilities):
"""For a given alias, add its potential entities and prior probabilities to the KB.""" """For a given alias, add its potential entities and prior probabilities to the KB."""
cdef hash_t alias_hash = hash_string(alias) cdef hash_t alias_hash = hash_string(alias)
cdef hash_t entity_hash = 0
cdef int64_t entity_index = 0
cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
# TODO: check that alias hadn't been defined before # TODO: check that alias hadn't been defined before
# TODO: check that entity is already in this KB (entity_index is OK) # TODO: check that entity is already in this KB (entity_index is OK)
# TODO: check sum(probabilities) <= 1 # TODO: check sum(probabilities) <= 1
# TODO: check len(entities) == len(probabilities) # TODO: check len(entities) == len(probabilities)
self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)

View File

@ -1,4 +1,16 @@
import spacy import spacy
from spacy.kb import KnowledgeBase
def create_kb():
    """Build an empty KnowledgeBase, add one entity, and report its size.

    Prints ``len()`` of the knowledge base before and after a single
    ``add_entity`` call for the entity ID "Q42" with prior probability 0.5.
    """
    kb = KnowledgeBase()
    print("kb size", len(kb))

    # Register a single entity; the announcement is printed after the insert.
    qid = "Q42"
    kb.add_entity(entity_id=qid, prob=0.5)
    print("adding entity", qid)

    print("kb size", len(kb))
def add_el(): def add_el():
@ -23,4 +35,5 @@ def add_el():
if __name__ == "__main__": if __name__ == "__main__":
add_el() # add_el()
create_kb()