spaCy/spacy/kb/candidate.py

import abc
from typing import List, Union, Callable


class BaseCandidate(abc.ABC):
    """A `BaseCandidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate (alias, entity_id) pair is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(
        self, mention: str, entity_id: Union[int, str], entity_vector: List[float]
    ):
        """Create new instance of `Candidate`. Note: has to be a sub-class, otherwise error will be raised.
        mention (str): Mention text for this candidate.
        entity_id (Union[int, str]): Unique entity ID.
        entity_vector (List[float]): Entity embedding.
        """
        self._mention = mention
        self._entity_id = entity_id
        self._entity_vector = entity_vector

    @property
    def entity(self) -> Union[int, str]:
        """RETURNS (Union[int, str]): Entity ID."""
        return self._entity_id

    @property
    @abc.abstractmethod
    def entity_(self) -> str:
        """RETURNS (str): Entity name."""

    @property
    def mention(self) -> str:
        """RETURNS (str): Mention."""
        return self._mention

    @property
    def entity_vector(self) -> List[float]:
        """RETURNS (List[float]): Entity vector."""
        return self._entity_vector


class Candidate(BaseCandidate):
    """`Candidate` for InMemoryLookupKBCandidate."""

    def __init__(
        self,
        retrieve_string_from_hash: Callable[[int], str],
        entity_hash: int,
        entity_freq: int,
        entity_vector: List[float],
        alias_hash: int,
        prior_prob: float,
    ):
        """
        retrieve_string_from_hash (Callable[[int], str]): Callable retrieveing entity name from provided entity/vocab
            hash.
        entity_hash (str): Hashed entity name /ID.
        entity_freq (int): Entity frequency in KB corpus.
        entity_vector (List[float]): Entity embedding.
        alias_hash (int): Hashed alias.
        prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
            the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
            doesn't) it might be better to eschew this information and always supply the same value.
        """
        super().__init__(
            mention=retrieve_string_from_hash(alias_hash),
            entity_id=entity_hash,
            entity_vector=entity_vector,
        )
        self._retrieve_string_from_hash = retrieve_string_from_hash
        self._entity_hash = entity_hash
        self._entity_freq = entity_freq
        self._alias_hash = alias_hash
        self._prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (int): hash of the entity_id's KB ID/name"""
        return self._entity_hash

    @property
    def entity_(self) -> str:
        """RETURNS (str): ID/name of this entity_id in the KB"""
        return self._retrieve_string_from_hash(self._entity_hash)

    @property
    def alias(self) -> int:
        """RETURNS (int): hash of the alias"""
        return self._alias_hash

    @property
    def alias_(self) -> str:
        """RETURNS (str): ID of the original alias"""
        return self._retrieve_string_from_hash(self._alias_hash)

    @property
    def entity_freq(self) -> float:
        return self._entity_freq

    @property
    def prior_prob(self) -> float:
        """RETURNS (List[float]): Entity vector."""
        return self._prior_prob