Convert Candidate from Cython to Python class.

This commit is contained in:
Raphael Mitsch 2023-02-28 13:49:52 +01:00
parent df4c069a13
commit cd98ab4e95
8 changed files with 147 additions and 99 deletions

View File

@ -30,7 +30,6 @@ MOD_NAMES = [
"spacy.lexeme", "spacy.lexeme",
"spacy.vocab", "spacy.vocab",
"spacy.attrs", "spacy.attrs",
"spacy.kb.candidate",
"spacy.kb.kb", "spacy.kb.kb",
"spacy.kb.kb_in_memory", "spacy.kb.kb_in_memory",
"spacy.ml.tb_framework", "spacy.ml.tb_framework",

View File

@ -1,3 +1,5 @@
from .kb import KnowledgeBase from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch from .candidate import Candidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate"]

View File

@ -1,12 +0,0 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
# Declaration of the C-level attributes of the `Candidate` extension type.
# Each attribute below is assigned from the constructor argument of the same name in the implementation.
cdef class Candidate:
# Knowledge base this candidate belongs to; `readonly` makes it readable (but not writable) from Python.
cdef readonly KnowledgeBase kb
# Hash of the entity's KB ID/name.
cdef hash_t entity_hash
# Frequency value stored for the entity.
cdef float entity_freq
# Entity vector, held as a C++ vector of floats.
cdef vector[float] entity_vector
# Hash of the alias (mention text).
cdef hash_t alias_hash
# Prior probability of this (alias, entity) pair.
cdef float prior_prob

109
spacy/kb/candidate.py Normal file
View File

@ -0,0 +1,109 @@
import abc
from typing import List, Union, Callable
class BaseCandidate(abc.ABC):
    """A `BaseCandidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Subclasses may attach further information to a candidate (alias, entity) pair, e.g. a prior probability.

    This class is abstract: subclasses have to implement the `entity_` property.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(
        self, mention: str, entity_id: Union[int, str], entity_vector: List[float]
    ):
        """Initialize the attributes shared by all candidates. Note: `BaseCandidate` is abstract, so only
        sub-classes implementing `entity_` can be instantiated - instantiating `BaseCandidate` itself raises a
        `TypeError`.

        mention (str): Mention text for this candidate.
        entity_id (Union[int, str]): Unique entity ID.
        entity_vector (List[float]): Entity embedding.
        """
        self._mention = mention
        self._entity_id = entity_id
        self._entity_vector = entity_vector

    @property
    def entity(self) -> Union[int, str]:
        """RETURNS (Union[int, str]): Entity ID."""
        return self._entity_id

    @property
    @abc.abstractmethod
    def entity_(self) -> str:
        """RETURNS (str): Entity name."""

    @property
    def mention(self) -> str:
        """RETURNS (str): Mention."""
        return self._mention

    @property
    def entity_vector(self) -> List[float]:
        """RETURNS (List[float]): Entity vector."""
        return self._entity_vector
class Candidate(BaseCandidate):
    """`Candidate` implementation for `InMemoryLookupKB`.

    Entity and alias are stored as hashes; their string forms are resolved on demand through the
    `retrieve_string_from_hash` callable passed to the constructor.
    """

    def __init__(
        self,
        retrieve_string_from_hash: Callable[[int], str],
        entity_hash: int,
        entity_freq: float,
        entity_vector: List[float],
        alias_hash: int,
        prior_prob: float,
    ):
        """
        retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
            hash.
        entity_hash (int): Hashed entity name/ID.
        entity_freq (float): Entity frequency in KB corpus.
        entity_vector (List[float]): Entity embedding.
        alias_hash (int): Hashed alias.
        prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
            the context, this mention resolves to this entity in the corpus used to build the knowledge base. In
            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
            doesn't) it might be better to eschew this information and always supply the same value.
        """
        # The mention text is resolved once, eagerly, from the alias hash.
        super().__init__(
            mention=retrieve_string_from_hash(alias_hash),
            entity_id=entity_hash,
            entity_vector=entity_vector,
        )
        self._retrieve_string_from_hash = retrieve_string_from_hash
        self._entity_hash = entity_hash
        self._entity_freq = entity_freq
        self._alias_hash = alias_hash
        self._prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (int): Hash of the entity's KB ID/name."""
        return self._entity_hash

    @property
    def entity_(self) -> str:
        """RETURNS (str): ID/name of this entity in the KB."""
        return self._retrieve_string_from_hash(self._entity_hash)

    @property
    def alias(self) -> int:
        """RETURNS (int): Hash of the alias."""
        return self._alias_hash

    @property
    def alias_(self) -> str:
        """RETURNS (str): ID of the original alias."""
        return self._retrieve_string_from_hash(self._alias_hash)

    @property
    def entity_freq(self) -> float:
        """RETURNS (float): Entity frequency in KB corpus."""
        return self._entity_freq

    @property
    def prior_prob(self) -> float:
        """RETURNS (float): Prior probability of the entity for this mention."""
        return self._prior_prob

View File

@ -1,74 +0,0 @@
# cython: infer_types=True, profile=True
from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span
# Cython extension type implementing `Candidate`.
# String forms of the stored hashes are looked up through `self.kb.vocab.strings`.
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init
"""
# Store the knowledge base and the per-candidate values exactly as passed in by the caller.
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self) -> str:
"""RETURNS (str): ID/name of this entity in the KB"""
# Resolve the hash back to its string via the KB's vocab string store.
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self) -> int:
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self) -> str:
"""RETURNS (str): ID of the original alias"""
# Resolve the hash back to its string via the KB's vocab string store.
return self.kb.vocab.strings[self.alias_hash]
# NOTE(review): the three properties below have the same names as the cdef attributes declared for this
# class in the .pxd (`entity_freq`, `entity_vector`, `prior_prob`) - confirm how Cython resolves that overlap.
# RETURNS (float): the stored entity frequency.
@property
def entity_freq(self) -> float:
return self.entity_freq
# RETURNS (Iterable[float]): the stored entity vector.
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
# RETURNS (float): the stored prior probability.
@property
def prior_prob(self) -> float:
return self.prior_prob
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Look up the candidate entities for a single mention by delegating to `kb.get_candidates()`.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    candidates = kb.get_candidates(mention)
    return candidates
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
    """
    Return candidate entities for the given mentions by delegating to `kb.get_candidates_batch()`.
    kb (KnowledgeBase): Knowledge base to query.
    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
    """
    return kb.get_candidates_batch(mentions)

View File

@ -238,14 +238,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_index = <int64_t>self._alias_index.get(alias_hash) alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self, return [
entity_hash=self._entries[entry_index].entity_hash, Candidate(
entity_freq=self._entries[entry_index].freq, retrieve_string_from_hash=self.vocab.strings.__getitem__,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index], entity_hash=self._entries[entry_index].entity_hash,
alias_hash=alias_hash, entity_freq=self._entries[entry_index].freq,
prior_prob=prior_prob) entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) alias_hash=alias_hash,
if entry_index != 0] prior_prob=prior_prob
)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0
]
def get_vector(self, str entity): def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity] cdef hash_t entity_hash = self.vocab.strings[entity]

View File

@ -6,7 +6,7 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged
from ...util import registry from ...util import registry
from ...kb import KnowledgeBase, InMemoryLookupKB from ...kb import KnowledgeBase, InMemoryLookupKB
from ...kb import Candidate, get_candidates, get_candidates_batch from ...kb import Candidate
from ...vocab import Vocab from ...vocab import Vocab
from ...tokens import Span, Doc from ...tokens import Span, Doc
from ..extract_spans import extract_spans from ..extract_spans import extract_spans
@ -109,3 +109,23 @@ def create_candidates_batch() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]: ]:
return get_candidates_batch return get_candidates_batch
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Look up the candidate entities for a single mention by delegating to `kb.get_candidates()`.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    candidates = kb.get_candidates(mention)
    return candidates
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
    """
    Return candidate entities for the given mentions by delegating to `kb.get_candidates_batch()`.
    kb (KnowledgeBase): Knowledge base to query.
    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
    """
    return kb.get_candidates_batch(mentions)

View File

@ -7,10 +7,10 @@ from thinc.types import Ragged
from spacy import registry, util from spacy import registry, util
from spacy.attrs import ENT_KB_ID from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
from spacy.lang.en import English from spacy.lang.en import English
from spacy.ml import load_kb from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker from spacy.ml.models.entity_linker import build_span_maker, get_candidates
from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline import EntityLinker, TrainablePipe
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer from spacy.scorer import Scorer