mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-14 03:26:24 +03:00
adding aliases per entity in the KB
This commit is contained in:
parent
f77b99c103
commit
af281c5466
53
spacy/kb.pxd
53
spacy/kb.pxd
|
@ -27,15 +27,25 @@ cdef struct _EntryC:
|
||||||
float prob
|
float prob
|
||||||
|
|
||||||
|
|
||||||
|
# Each alias struct stores a list of entry indices with their prior probabilities
|
||||||
|
# for this specific mention/alias.
|
||||||
|
cdef struct _AliasC:
|
||||||
|
|
||||||
|
# All entry candidates for this alias
|
||||||
|
const vector[int64_t] entry_indices
|
||||||
|
|
||||||
|
# Prior probability P(entity|alias) - should sum up to (at most) 1.
|
||||||
|
const vector[float] probs
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
|
|
||||||
# This maps 64bit keys to 64bit values. Here the key would be a hash of
|
# This maps 64bit keys (hash of unique entity string)
|
||||||
# a unique string name for the entity, and the value would be the position
|
# to 64bit values (position of the _EntryC struct in the _entries vector).
|
||||||
# of the _EntryC struct in our vector.
|
|
||||||
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
||||||
# the only overhead is the vacancy rate, which is approximately 30%.
|
# the only overhead is the vacancy rate, which is approximately 30%.
|
||||||
cdef PreshMap _index
|
cdef PreshMap _entry_index
|
||||||
|
|
||||||
# Each entry takes 128 bits, and again we'll have a 30% or so overhead for
|
# Each entry takes 128 bits, and again we'll have a 30% or so overhead for
|
||||||
# over allocation.
|
# over allocation.
|
||||||
|
@ -43,6 +53,16 @@ cdef class KnowledgeBase:
|
||||||
# Storing 1m entries would take 41.6mb under this scheme.
|
# Storing 1m entries would take 41.6mb under this scheme.
|
||||||
cdef vector[_EntryC] _entries
|
cdef vector[_EntryC] _entries
|
||||||
|
|
||||||
|
# This maps 64bit keys (hash of unique alias string)
|
||||||
|
# to 64bit values (position of the _AliasC struct in the _aliases_table vector).
|
||||||
|
cdef PreshMap _alias_index
|
||||||
|
|
||||||
|
# This should map mention hashes to (entry_id, prob) tuples. The probability
|
||||||
|
# should be P(entity | mention), which is pretty important to know.
|
||||||
|
# We can pack both pieces of information into a 64-bit value, to keep things
|
||||||
|
# efficient.
|
||||||
|
cdef vector[_AliasC] _aliases_table
|
||||||
|
|
||||||
# This is the part which might take more space: storing various
|
# This is the part which might take more space: storing various
|
||||||
# categorical features for the entries, and storing vectors for disambiguation
|
# categorical features for the entries, and storing vectors for disambiguation
|
||||||
# and possibly usage.
|
# and possibly usage.
|
||||||
|
@ -61,23 +81,30 @@ cdef class KnowledgeBase:
|
||||||
# optional data, we can let users configure a DB as the backend for this.
|
# optional data, we can let users configure a DB as the backend for this.
|
||||||
cdef object _features_table
|
cdef object _features_table
|
||||||
|
|
||||||
# This should map mention hashes to (entry_id, prob) tuples. The probability
|
|
||||||
# should be P(entity | mention), which is pretty important to know.
|
|
||||||
# We can pack both pieces of information into a 64-bit value, to keep things
|
|
||||||
# efficient.
|
|
||||||
cdef object _aliases_table
|
|
||||||
|
|
||||||
cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
|
cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
|
||||||
int feats_row):
|
int feats_row):
|
||||||
"""Add an entry to the knowledge base."""
|
"""Add an entry to the knowledge base."""
|
||||||
# This is what we'll map the hash key to. It's where the entry will sit
|
# This is what we'll map the hash key to. It's where the entry will sit
|
||||||
# in the vector of entries, so we can get it later.
|
# in the vector of entries, so we can get it later.
|
||||||
cdef int64_t index = self._entries.size()
|
cdef int64_t entity_index = self._entries.size()
|
||||||
self._entries.push_back(
|
self._entries.push_back(
|
||||||
_EntryC(
|
_EntryC(
|
||||||
vector_rows=vector_rows,
|
vector_rows=vector_rows,
|
||||||
feats_row=feats_row,
|
feats_row=feats_row,
|
||||||
prob=prob
|
prob=prob
|
||||||
))
|
))
|
||||||
self._index[key] = index
|
self._index[entity_key] = entity_index
|
||||||
return index
|
return entity_index
|
||||||
|
|
||||||
|
cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
|
||||||
|
"""Connect a mention to a list of potential entities with their prior probabilities ."""
|
||||||
|
cdef int64_t alias_index = self._aliases_table.size()
|
||||||
|
|
||||||
|
self._aliases_table.push_back(
|
||||||
|
_AliasC(
|
||||||
|
entry_indices=entry_indices,
|
||||||
|
probs=probs
|
||||||
|
))
|
||||||
|
self._alias_index[alias_key] = alias_index
|
||||||
|
return alias_index
|
23
spacy/kb.pyx
23
spacy/kb.pyx
|
@ -5,14 +5,14 @@ cdef class KnowledgeBase:
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self._entries.size()
|
return self._entries.size()
|
||||||
|
|
||||||
def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
|
def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
|
||||||
# TODO: more friendly check for non-unique name
|
# TODO: more friendly check for non-unique name
|
||||||
if name in self:
|
if entity_id in self:
|
||||||
return
|
return
|
||||||
|
|
||||||
cdef hash_t name_hash = hash_string(name)
|
cdef hash_t id_hash = hash_string(entity_id)
|
||||||
cdef int32_t dummy_value = 342
|
cdef int32_t dummy_value = 342
|
||||||
self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
|
self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
|
||||||
# TODO self._vectors_table.get_pointer(vectors),
|
# TODO self._vectors_table.get_pointer(vectors),
|
||||||
# self._features_table.get(features))
|
# self._features_table.get(features))
|
||||||
|
|
||||||
|
@ -22,10 +22,13 @@ cdef class KnowledgeBase:
|
||||||
cdef hash_t entity_hash = 0
|
cdef hash_t entity_hash = 0
|
||||||
cdef int64_t entity_index = 0
|
cdef int64_t entity_index = 0
|
||||||
|
|
||||||
# TODO: check len(entities) == len(probabilities)
|
cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
|
||||||
for entity, prob in zip(entities, probabilities):
|
|
||||||
entity_hash = hash_string(entity)
|
self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
|
||||||
entity_index = self._index[entity_hash]
|
|
||||||
# TODO: check that entity is already in this KB (entity_index is OK)
|
# TODO: check that alias hasn't been defined before
|
||||||
self._aliases_table.add(alias_hash, entity_index, prob)
|
# TODO: check that entity is already in this KB (entity_index is OK)
|
||||||
|
# TODO: check sum(probabilities) <= 1
|
||||||
|
# TODO: check len(entities) == len(probabilities)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user