spaCy/spacy/kb.pxd

"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t

from spacy.strings cimport StringStore
from .typedefs cimport hash_t


# Internal struct, for storage and disambiguation. This isn't what we return
# to the user as the answer to "here's your entity". It's the minimum number
# of bits we need to keep track of the answers.
cdef struct _EntryC:

    # The hash of this entry's unique ID and name in the kB
    hash_t entity_id_hash
    hash_t entity_name_hash

    # Allows retrieval of one or more vectors.
    # Each element of vector_rows should be an index into a vectors table.
    # Every entry should have the same number of vectors, so we can avoid storing
    # the number of vectors in each knowledge-base struct
    int32_t* vector_rows

    # Allows retrieval of a struct of non-vector features. We could make this a
    # pointer, but we have 32 bits left over in the struct after prob, so we'd
    # like this to only be 32 bits. We can also set this to -1, for the common
    # case where there are no features.
    int32_t feats_row

    # log probability of entity, based on corpus frequency
    float prob


# Each alias struct stores a list of Entry pointers with their prior probabilities
# for this specific mention/alias.
cdef struct _AliasC:

    # All entry candidates for this alias
    vector[int64_t] entry_indices

    # Prior probability P(entity|alias) - should sum up to (at most) 1.
    vector[float] probs


# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:

    cdef readonly KnowledgeBase kb
    cdef hash_t entity_id_hash
    cdef hash_t alias_hash
    cdef float prior_prob


cdef class KnowledgeBase:
    cdef Pool mem
    cpdef readonly StringStore strings

    # This maps 64bit keys (hash of unique entity string)
    # to 64bit values (position of the _EntryC struct in the _entries vector).
    # The PreshMap is pretty space efficient, as it uses open addressing. So
    # the only overhead is the vacancy rate, which is approximately 30%.
    cdef PreshMap _entry_index

    # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
    # over allocation.
    # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
    # Storing 1m entries would take 41.6mb under this scheme.
    cdef vector[_EntryC] _entries

    # This maps 64bit keys (hash of unique alias string)
    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
    cdef PreshMap _alias_index

    # This should map mention hashes to (entry_id, prob) tuples. The probability
    # should be P(entity | mention), which is pretty important to know.
    # We can pack both pieces of information into a 64-bit value, to keep things
    # efficient.
    cdef vector[_AliasC] _aliases_table

    # This is the part which might take more space: storing various
    # categorical features for the entries, and storing vectors for disambiguation
    # and possibly usage.
    # If each entry gets a 300-dimensional vector, for 1m entries we would need
    # 1.2gb. That gets expensive fast. What might be better is to avoid learning
    # a unique vector for every entity. We could instead have a compositional
    # model, that embeds different features of the entities into vectors. We'll
    # still want some per-entity features, like the Wikipedia text or entity
    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
    cdef object _vectors_table

    # It's very useful to track categorical features, at least for output, even
    # if they're not useful in the model itself. For instance, we should be
    # able to track stuff like a person's date of birth or whatever. This can
    # easily make the KB bigger, but if this isn't needed by the model, and it's
    # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table

    cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob,
                                     int32_t* vector_rows, int feats_row):
        """Add an entry to the knowledge base."""
        # This is what we'll map the hash key to. It's where the entry will sit
        # in the vector of entries, so we can get it later.
        cdef int64_t new_index = self._entries.size()
        self._entries.push_back(
            _EntryC(
                entity_id_hash=entity_id_hash,
                entity_name_hash=entity_name_hash,
                vector_rows=vector_rows,
                feats_row=feats_row,
                prob=prob
            ))
        self._entry_index[entity_id_hash] = new_index
        return new_index

    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
        """Connect a mention to a list of potential entities with their prior probabilities ."""
        cdef int64_t new_index = self._aliases_table.size()

        self._aliases_table.push_back(
            _AliasC(
                entry_indices=entry_indices,
                probs=probs
            ))
        self._alias_index[alias_hash] = new_index
        return new_index

    cdef inline _create_empty_vectors(self):
        """ 
        Making sure the first element of each vector is a dummy,
        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
        cf. https://github.com/explosion/preshed/issues/17
        """
        cdef int32_t dummy_value = 0
        self.strings.add("")
        self._entries.push_back(
            _EntryC(
                entity_id_hash=self.strings[""],
                entity_name_hash=self.strings[""],
                vector_rows=&dummy_value,
                feats_row=dummy_value,
                prob=dummy_value
            ))
        self._aliases_table.push_back(
            _AliasC(
                entry_indices=[dummy_value],
                probs=[dummy_value]
            ))
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`"""Knowledge-base for entity or concept linking."""`
			`from cymem.cymem cimport Pool`
			`from preshed.maps cimport PreshMap`
			`from libcpp.vector cimport vector`
hash the entity name 2019-03-15 17:00:53 +03:00			`from libc.stdint cimport int32_t, int64_t`
use StringStore 2019-03-19 18:43:23 +03:00
			`from spacy.strings cimport StringStore`
add pyx and separate method to add aliases 2019-03-15 18:05:23 +03:00			`from .typedefs cimport hash_t`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00

			`# Internal struct, for storage and disambiguation. This isn't what we return`
			`# to the user as the answer to "here's your entity". It's the minimum number`
			`# of bits we need to keep track of the answers.`
			`cdef struct _EntryC:`

name per entity 2019-03-21 20:20:57 +03:00			`# The hash of this entry's unique ID and name in the kB`
			`hash_t entity_id_hash`
			`hash_t entity_name_hash`
create candidate object from entry pointer (not fully functional yet) 2019-03-21 02:04:06 +03:00
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`# Allows retrieval of one or more vectors.`
			`# Each element of vector_rows should be an index into a vectors table.`
			`# Every entry should have the same number of vectors, so we can avoid storing`
			`# the number of vectors in each knowledge-base struct`
create candidate object from entry pointer (not fully functional yet) 2019-03-21 02:04:06 +03:00			`int32_t* vector_rows`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00
			`# Allows retrieval of a struct of non-vector features. We could make this a`
			`# pointer, but we have 32 bits left over in the struct after prob, so we'd`
			`# like this to only be 32 bits. We can also set this to -1, for the common`
			`# case where there are no features.`
			`int32_t feats_row`
documented some comments and todos 2019-03-15 13:37:24 +03:00
			`# log probability of entity, based on corpus frequency`
			`float prob`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00

adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00			`# Each alias struct stores a list of Entry pointers with their prior probabilities`
			`# for this specific mention/alias.`
			`cdef struct _AliasC:`

			`# All entry candidates for this alias`
very minimal KB functionality working 2019-03-18 19:27:51 +03:00			`vector[int64_t] entry_indices`
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00
			`# Prior probability P(entity\|alias) - should sum up to (at most) 1.`
very minimal KB functionality working 2019-03-18 19:27:51 +03:00			`vector[float] probs`
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00

select candidate with highest prior probabiity 2019-03-21 20:55:01 +03:00			`# Object used by the Entity Linker that summarizes one entity-alias candidate combination.`
create candidate object from entry pointer (not fully functional yet) 2019-03-21 02:04:06 +03:00			`cdef class Candidate:`

property getters and keep track of KB internally 2019-03-21 15:26:12 +03:00			`cdef readonly KnowledgeBase kb`
name per entity 2019-03-21 20:20:57 +03:00			`cdef hash_t entity_id_hash`
create candidate object from entry pointer (not fully functional yet) 2019-03-21 02:04:06 +03:00			`cdef hash_t alias_hash`
			`cdef float prior_prob`


kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`cdef class KnowledgeBase:`
			`cdef Pool mem`
use StringStore 2019-03-19 18:43:23 +03:00			`cpdef readonly StringStore strings`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00			`# This maps 64bit keys (hash of unique entity string)`
			`# to 64bit values (position of the _EntryC struct in the _entries vector).`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`# The PreshMap is pretty space efficient, as it uses open addressing. So`
			`# the only overhead is the vacancy rate, which is approximately 30%.`
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00			`cdef PreshMap _entry_index`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00
			`# Each entry takes 128 bits, and again we'll have a 30% or so overhead for`
			`# over allocation.`
			`# In total we end up with (N1281.3)+(N1281.3) bits for N entries.`
			`# Storing 1m entries would take 41.6mb under this scheme.`
			`cdef vector[_EntryC] _entries`

adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00			`# This maps 64bit keys (hash of unique alias string)`
			`# to 64bit values (position of the _AliasC struct in the _aliases_table vector).`
			`cdef PreshMap _alias_index`

			`# This should map mention hashes to (entry_id, prob) tuples. The probability`
			`# should be P(entity \| mention), which is pretty important to know.`
			`# We can pack both pieces of information into a 64-bit value, to keep things`
			`# efficient.`
			`cdef vector[_AliasC] _aliases_table`

kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`# This is the part which might take more space: storing various`
			`# categorical features for the entries, and storing vectors for disambiguation`
			`# and possibly usage.`
			`# If each entry gets a 300-dimensional vector, for 1m entries we would need`
			`# 1.2gb. That gets expensive fast. What might be better is to avoid learning`
			`# a unique vector for every entity. We could instead have a compositional`
			`# model, that embeds different features of the entities into vectors. We'll`
			`# still want some per-entity features, like the Wikipedia text or entity`
			`# co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.`
			`cdef object _vectors_table`

			`# It's very useful to track categorical features, at least for output, even`
			`# if they're not useful in the model itself. For instance, we should be`
			`# able to track stuff like a person's date of birth or whatever. This can`
			`# easily make the KB bigger, but if this isn't needed by the model, and it's`
			`# optional data, we can let users configure a DB as the backend for this.`
			`cdef object _features_table`

name per entity 2019-03-21 20:20:57 +03:00			`cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob,`
			`int32_t* vector_rows, int feats_row):`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`"""Add an entry to the knowledge base."""`
add pyx and separate method to add aliases 2019-03-15 18:05:23 +03:00			`# This is what we'll map the hash key to. It's where the entry will sit`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`# in the vector of entries, so we can get it later.`
minimal EL pipe 2019-03-21 19:33:25 +03:00			`cdef int64_t new_index = self._entries.size()`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`self._entries.push_back(`
			`_EntryC(`
name per entity 2019-03-21 20:20:57 +03:00			`entity_id_hash=entity_id_hash,`
			`entity_name_hash=entity_name_hash,`
kb snippet, draft by Matt (wip) 2019-03-15 13:17:35 +03:00			`vector_rows=vector_rows,`
			`feats_row=feats_row,`
			`prob=prob`
			`))`
name per entity 2019-03-21 20:20:57 +03:00			`self._entry_index[entity_id_hash] = new_index`
minimal EL pipe 2019-03-21 19:33:25 +03:00			`return new_index`
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00
store entity hash instead of pointer 2019-03-21 14:31:02 +03:00			`cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):`
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00			`"""Connect a mention to a list of potential entities with their prior probabilities ."""`
minimal EL pipe 2019-03-21 19:33:25 +03:00			`cdef int64_t new_index = self._aliases_table.size()`
adding aliases per entity in the KB 2019-03-18 14:38:40 +03:00
			`self._aliases_table.push_back(`
			`_AliasC(`
			`entry_indices=entry_indices,`
			`probs=probs`
			`))`
minimal EL pipe 2019-03-21 19:33:25 +03:00			`self._alias_index[alias_hash] = new_index`
			`return new_index`
adding and retrieving aliases 2019-03-18 19:50:01 +03:00
minimal EL pipe 2019-03-21 19:33:25 +03:00			`cdef inline _create_empty_vectors(self):`
avoid value 0 in preshmap and helpful user warnings 2019-03-19 23:35:24 +03:00			`"""`
			`Making sure the first element of each vector is a dummy,`
			`because the PreshMap maps pointing to indices in these vectors can not contain 0 as value`
			`cf. https://github.com/explosion/preshed/issues/17`
			`"""`
			`cdef int32_t dummy_value = 0`
store entity hash instead of pointer 2019-03-21 14:31:02 +03:00			`self.strings.add("")`
avoid value 0 in preshmap and helpful user warnings 2019-03-19 23:35:24 +03:00			`self._entries.push_back(`
			`_EntryC(`
name per entity 2019-03-21 20:20:57 +03:00			`entity_id_hash=self.strings[""],`
			`entity_name_hash=self.strings[""],`
avoid value 0 in preshmap and helpful user warnings 2019-03-19 23:35:24 +03:00			`vector_rows=&dummy_value,`
			`feats_row=dummy_value,`
			`prob=dummy_value`
			`))`
			`self._aliases_table.push_back(`
			`_AliasC(`
			`entry_indices=[dummy_value],`
			`probs=[dummy_value]`
			`))`

adding and retrieving aliases 2019-03-18 19:50:01 +03:00