add pyx and separate method to add aliases

This commit is contained in:
svlandeg 2019-03-15 16:05:23 +01:00
parent feb71e15fd
commit 27483f9080
2 changed files with 29 additions and 19 deletions

View File

@ -3,8 +3,7 @@ from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t, int64_t
from .typedefs cimport attr_t, hash_t from .typedefs cimport hash_t
from .strings cimport hash_string
# Internal struct, for storage and disambiguation. This isn't what we return # Internal struct, for storage and disambiguation. This isn't what we return
@ -68,26 +67,10 @@ cdef class KnowledgeBase:
# efficient. # efficient.
cdef object _aliases_table cdef object _aliases_table
def __len__(self):
return self._entries.size()
def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
# TODO: more friendly check for non-unique name
if name in self:
return
cdef hash_t key = hash_string(name)
self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors),
self._features_table.get(features))
# TODO: hash the aliases?
for alias, prob_alias in aliases:
self._aliases_table.add(alias, key, prob_alias)
cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows, cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
int feats_row) nogil: int feats_row) nogil:
"""Add an entry to the knowledge base.""" """Add an entry to the knowledge base."""
# This is what we'll map the orth to. It's where the entry will sit # This is what we'll map the hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later. # in the vector of entries, so we can get it later.
cdef int64_t index = self._entries.size() cdef int64_t index = self._entries.size()
self._entries.push_back( self._entries.push_back(

27
spacy/kb.pyx Normal file
View File

@ -0,0 +1,27 @@
from .strings cimport hash_string
cdef class KnowledgeBase:
    """Python-facing methods of the knowledge base.

    The C-level attributes used here (`_entries`, `_index`, `_vectors_table`,
    `_features_table`, `_aliases_table`) and `c_add_entity` are declared
    outside this file (in the accompanying .pxd).
    """

    def __len__(self):
        """Return the number of entries currently stored in `_entries`."""
        return self._entries.size()

    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
        """Add an entity to the KB, keyed by the hash of its name.

        name: The entity name; it is hashed with `hash_string` to get the key.
        prob (float): Probability stored with the entity.
        vectors: Passed to `self._vectors_table.get_pointer`.
        features: Passed to `self._features_table.get`.
        aliases: Currently unused by this method; kept so existing callers
            keep working. Aliases are registered through `add_alias`.

        If `name in self` is already true, the call returns without adding
        anything.
        """
        # All cdef declarations live at the top of the function body.
        cdef hash_t name_hash

        # TODO: more friendly check for non-unique name
        if name in self:
            return

        name_hash = hash_string(name)
        self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors),
                          self._features_table.get(features))

    def add_alias(self, alias, entities, probabilities):
        """For a given alias, add its potential entities and prior probabilities to the KB.

        alias: The alias string; it is hashed with `hash_string`.
        entities: Names of the entities the alias may refer to.
        probabilities: Prior probabilities, parallel to `entities`.

        Raises ValueError if `entities` and `probabilities` differ in length.
        """
        # Cython does not allow `cdef` statements inside control structures
        # such as a `for` loop, so the loop variables are declared up front
        # and only assigned inside the loop.
        cdef hash_t alias_hash = hash_string(alias)
        cdef hash_t entity_hash
        cdef int64_t entity_index

        # Materialise the inputs so they can be measured even when the caller
        # passes a generator; zip() alone would silently drop unmatched items.
        entities = list(entities)
        probabilities = list(probabilities)
        if len(entities) != len(probabilities):
            raise ValueError(
                "entities and probabilities must have the same length "
                "(got {} and {})".format(len(entities), len(probabilities))
            )

        for entity, prob in zip(entities, probabilities):
            entity_hash = hash_string(entity)
            entity_index = self._index[entity_hash]
            # TODO: check that entity is already in this KB (entity_index is OK)
            self._aliases_table.add(alias_hash, entity_index, prob)