spaCy/spacy/kb/kb_in_memory.pxd

155 lines
6.6 KiB
Cython
Raw Normal View History

2019-03-15 13:17:35 +03:00
"""Knowledge-base for entity or concept linking."""
2019-03-15 17:00:53 +03:00
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
2023-06-26 12:41:03 +03:00
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
2019-03-19 18:43:23 +03:00
2023-06-26 12:41:03 +03:00
from ..structs cimport AliasC, KBEntryC
Refactor KB for easier customization (#11268) * Add implementation of batching + backwards compatibility fixes. Tests indicate issue with batch disambiguation for custom singular entity lookups. * Fix tests. Add distinction w.r.t. batch size. * Remove redundant and add new comments. * Adjust comments. Fix variable naming in EL prediction. * Fix mypy errors. * Remove KB entity type config option. Change return types of candidate retrieval functions to Iterable from Iterator. Fix various other issues. * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Add error messages to NotImplementedErrors. Remove redundant comment. * Fix imports. * Remove redundant comments. * Rename KnowledgeBase to InMemoryLookupKB and BaseKnowledgeBase to KnowledgeBase. * Fix tests. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Move KB into subdirectory. * Adjust imports after KB move to dedicated subdirectory. * Fix config imports. * Move Candidate + retrieval functions to separate module. Fix other, small issues. * Fix docstrings and error message w.r.t. class names. Fix typing for candidate retrieval functions. * Update spacy/kb/kb_in_memory.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix typing. * Change typing of mentions to be Span instead of Union[Span, str]. * Update docs. * Update EntityLinker and _architecture docs. 
* Update website/docs/api/entitylinker.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Adjust message for E1046. * Re-add section for Candidate in kb.md, add reference to dedicated page. * Update docs and docstrings. * Re-add section + reference for KnowledgeBase.get_alias_candidates() in docs. * Update spacy/kb/candidate.pyx * Update spacy/kb/kb_in_memory.pyx * Update spacy/pipeline/legacy/entity_linker.py * Remove canididate.md. Remove mistakenly added config snippet in entity_linker.py. Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-08 11:38:07 +03:00
from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
2019-06-26 16:55:26 +03:00
# Shorthand names for the C++ std::vector instantiations used in this module.
# Growable array of KBEntryC structs.
ctypedef vector[KBEntryC] entry_vec
# Growable array of AliasC structs.
ctypedef vector[AliasC] alias_vec
# A single vector of C floats.
ctypedef vector[float] float_vec
# A vector of float vectors, i.e. one float_vec per row.
ctypedef vector[float_vec] float_matrix
2019-03-18 14:38:40 +03:00
Refactor KB for easier customization (#11268) * Add implementation of batching + backwards compatibility fixes. Tests indicate issue with batch disambiguation for custom singular entity lookups. * Fix tests. Add distinction w.r.t. batch size. * Remove redundant and add new comments. * Adjust comments. Fix variable naming in EL prediction. * Fix mypy errors. * Remove KB entity type config option. Change return types of candidate retrieval functions to Iterable from Iterator. Fix various other issues. * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/kb_base.pyx Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Update spacy/pipeline/entity_linker.py Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Add error messages to NotImplementedErrors. Remove redundant comment. * Fix imports. * Remove redundant comments. * Rename KnowledgeBase to InMemoryLookupKB and BaseKnowledgeBase to KnowledgeBase. * Fix tests. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Move KB into subdirectory. * Adjust imports after KB move to dedicated subdirectory. * Fix config imports. * Move Candidate + retrieval functions to separate module. Fix other, small issues. * Fix docstrings and error message w.r.t. class names. Fix typing for candidate retrieval functions. * Update spacy/kb/kb_in_memory.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix typing. * Change typing of mentions to be Span instead of Union[Span, str]. * Update docs. * Update EntityLinker and _architecture docs. 
* Update website/docs/api/entitylinker.md Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> * Adjust message for E1046. * Re-add section for Candidate in kb.md, add reference to dedicated page. * Update docs and docstrings. * Re-add section + reference for KnowledgeBase.get_alias_candidates() in docs. * Update spacy/kb/candidate.pyx * Update spacy/kb/kb_in_memory.pyx * Update spacy/pipeline/legacy/entity_linker.py * Remove canididate.md. Remove mistakenly added config snippet in entity_linker.py. Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-08 11:38:07 +03:00
cdef class InMemoryLookupKB(KnowledgeBase):
    # Lookup from a 64-bit key (hash of the unique entity string) to a 64-bit
    # value (the position of that entity's KBEntryC struct inside _entries).
    # PreshMap uses open addressing, which keeps it fairly space efficient:
    # the only overhead is the vacancy rate, which is approximately 30%.
    cdef PreshMap _entry_index

    # Storage for the entries themselves. Each entry takes 128 bits, with
    # roughly another 30% of overhead for over-allocation. In total that is
    # (N*128*1.3)+(N*128*1.3) bits for N entries, so storing 1m entries would
    # take 41.6mb under this scheme.
    # NOTE(review): c_add_entity fills four fields (entity_hash, vector_index,
    # feats_row, freq); verify the 128-bit figure against the KBEntryC
    # definition.
    cdef entry_vec _entries

    # Lookup from a 64-bit key (hash of the unique alias string) to a 64-bit
    # value (the position of that alias' AliasC struct inside _aliases_table).
    cdef PreshMap _alias_index

    # Per-alias candidate information: for every mention, the entities it may
    # refer to together with the probability P(entity | mention), which is
    # pretty important to know. Each AliasC stored here carries a vector of
    # entry indices and a vector of probabilities (see c_add_aliases).
    cdef alias_vec _aliases_table

    # This is the part which might take more space: vectors for
    # disambiguation, and possibly categorical features and usage data for
    # the entries. With a 300-dimensional vector per entry, 1m entries would
    # need 1.2gb, which gets expensive fast. It might be better to avoid
    # learning a unique vector for every entity and instead use a
    # compositional model that embeds different features of the entities into
    # vectors. Some per-entity features, like the Wikipedia text or entity
    # co-occurrence, will still be wanted. Hopefully those vectors can be
    # narrow, e.g. 64 dimensions.
    cdef float_matrix _vectors_table

    # Categorical features are very useful to track, at least for output, even
    # when the model itself does not use them (for instance a person's date
    # of birth). This can easily make the KB bigger, but if the data is
    # optional and not needed by the model, users can be allowed to configure
    # a DB as the backend for it.
    cdef object _features_table

    cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
        """Append an entity vector to the vectors table.

        Returns the index at which the vector was stored.
        """
        # The table's size before the append is the index of the new row.
        cdef int64_t row = self._vectors_table.size()
        self._vectors_table.push_back(entity_vector)
        return row

    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
                                     int32_t vector_index, int feats_row) nogil:
        """Append a new entry to the vector of entries.

        Returns the position of the new entry. The caller is responsible for
        storing that position in _entry_index afterwards.
        """
        # The entity hash key will be mapped to this position: it is where
        # the entry sits in the vector of entries, so it can be found later.
        cdef int64_t position = self._entries.size()
        # Fill the struct field by field instead of using a struct
        # initializer, so that the function can remain nogil,
        # cf https://github.com/cython/cython/issues/1642
        cdef KBEntryC record
        record.entity_hash = entity_hash
        record.freq = freq
        record.vector_index = vector_index
        record.feats_row = feats_row
        self._entries.push_back(record)
        return position

    cdef inline int64_t c_add_aliases(self, hash_t alias_hash,
                                      vector[int64_t] entry_indices,
                                      vector[float] probs) nogil:
        """Connect a mention to a list of potential entities with their prior probabilities.

        Returns the position of the new alias. The caller is responsible for
        storing that position in _alias_index afterwards.
        """
        # The alias hash key will be mapped to this position: it is where the
        # alias is defined in the vector of aliases.
        cdef int64_t position = self._aliases_table.size()
        # Field-by-field assignment instead of a struct initializer, to
        # enable nogil.
        cdef AliasC record
        record.probs = probs
        record.entry_indices = entry_indices
        self._aliases_table.push_back(record)
        return position

    cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
        """
        Seed _entries and _aliases_table with a dummy first element, because
        the PreshMaps whose values point to indices in these vectors can not
        contain 0 as value
        cf. https://github.com/explosion/preshed/issues/17
        """
        cdef int32_t zero = 0
        cdef KBEntryC placeholder_entry
        cdef AliasC placeholder_alias
        cdef vector[int64_t] placeholder_indices
        cdef vector[float] placeholder_probs

        # Field-by-field assignment instead of struct initializers, to enable
        # nogil.
        placeholder_entry.entity_hash = dummy_hash
        placeholder_entry.vector_index = zero
        placeholder_entry.feats_row = zero
        placeholder_entry.freq = zero

        # The dummy alias points at a single entry index 0 with probability 0.
        placeholder_indices.push_back(0)
        placeholder_probs.push_back(0)
        placeholder_alias.entry_indices = placeholder_indices
        placeholder_alias.probs = placeholder_probs

        self._entries.push_back(placeholder_entry)
        self._aliases_table.push_back(placeholder_alias)

    # Declaration only: the body of set_entities is not visible in this file.
    cpdef set_entities(self, entity_list, freq_list, vector_list)
2019-03-18 19:50:01 +03:00
cdef class Writer:
    # Declarations for a writer built around a C stdio stream. Only the
    # signatures are given here; the method bodies are not visible in this
    # file.
    # Every method is declared `except -1`: a return value of -1 tells Cython
    # that a Python exception was raised and must be propagated.
    cdef FILE* _fp

    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
    cdef int write_vector_element(self, float element) except -1
    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1

    cdef int write_alias_length(self, int64_t alias_length) except -1
    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
    cdef int write_alias(self, int64_t entry_index, float prob) except -1

    # Takes an untyped pointer plus a byte count; presumably the shared
    # low-level primitive behind the write_* methods (confirm in the
    # implementation).
    cdef int _write(self, void* value, size_t size) except -1
cdef class Reader:
    # Declarations for a reader built around a C stdio stream. Only the
    # signatures are given here; the method bodies are not visible in this
    # file.
    # Values are handed back through pointer arguments; the int return is
    # declared `except -1`, so -1 tells Cython that a Python exception was
    # raised and must be propagated.
    cdef FILE* _fp

    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
    cdef int read_vector_element(self, float* element) except -1
    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1

    cdef int read_alias_length(self, int64_t* alias_length) except -1
    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1

    # Takes an untyped pointer plus a byte count; presumably the shared
    # low-level primitive behind the read_* methods (confirm in the
    # implementation).
    cdef int _read(self, void* value, size_t size) except -1