2023-09-12 09:50:01 +03:00
|
|
|
# cython: infer_types=True
|
2022-09-08 11:38:07 +03:00
|
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
from typing import Iterable, Tuple, Union
|
2023-06-26 12:41:03 +03:00
|
|
|
|
2022-09-08 11:38:07 +03:00
|
|
|
from cymem.cymem cimport Pool
|
|
|
|
|
2023-06-26 12:41:03 +03:00
|
|
|
from ..errors import Errors
|
2023-03-20 14:25:18 +03:00
|
|
|
from ..tokens import Span, SpanGroup
|
2022-09-08 11:38:07 +03:00
|
|
|
from ..util import SimpleFrozenList
|
2023-06-26 12:41:03 +03:00
|
|
|
from .candidate import Candidate
|
2022-09-08 11:38:07 +03:00
|
|
|
|
|
|
|
|
|
|
|
cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and
    their textual aliases, to support entity linking of named entities to
    real-world concepts.

    This is an abstract class and requires its operations to be implemented.
    Instantiating it directly raises a TypeError; every operation that is not
    overridden by a subclass raises a NotImplementedError.

    DOCS: https://spacy.io/api/kb
    """

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase.

        vocab (Vocab): Vocabulary stored on the instance as `vocab`.
        entity_vector_length (int): Value stored on the instance as
            `entity_vector_length`.
        RAISES (TypeError): If `KnowledgeBase` itself (rather than a subclass)
            is instantiated.
        """
        # Make sure abstract KB is not instantiated.
        if self.__class__ is KnowledgeBase:
            raise TypeError(
                Errors.E1046.format(cls_name=self.__class__.__name__)
            )

        self.vocab = vocab
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates_batch(
        self, mentions: SpanGroup
    ) -> Iterable[Iterable[Candidate]]:
        """
        Return candidate entities for the specified Span mentions. Each candidate defines at least the entity and the
        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
        probability of the specified mention text resolving to that entity - might be included.
        If no candidates are found for a given mention, an empty list is returned.
        This default implementation calls `get_candidates` once per mention, in order.
        mentions (SpanGroup): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates, one iterable per mention.
        """
        return [self.get_candidates(span) for span in mentions]

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
        Return candidate entities for a specific mention. Each candidate defines at least the entity and the
        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
        probability of the specified mention text resolving to that entity - might be included.
        If no candidate is found for the given mention, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        # Instances have no `__name__` attribute; the name lives on the class.
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="get_candidates",
                name=self.__class__.__name__,
            )
        )

    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
        """
        Return vectors for entities.
        This default implementation calls `get_vector` once per entity, in order.
        entities (Iterable[str]): Entity names/IDs.
        RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
        """
        return [self.get_vector(entity) for entity in entities]

    def get_vector(self, str entity) -> Iterable[float]:
        """
        Return vector for entity.
        entity (str): Entity name/ID.
        RETURNS (Iterable[float]): Vector for specified entity.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="get_vector",
                name=self.__class__.__name__,
            )
        )

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the current state to a binary string.
        RETURNS (bytes): Current state as binary string.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="to_bytes",
                name=self.__class__.__name__,
            )
        )

    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
        """Load state from a binary string.
        bytes_data (bytes): KB state.
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="from_bytes",
                name=self.__class__.__name__,
            )
        )

    def to_disk(
        self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="to_disk",
                name=self.__class__.__name__,
            )
        )

    def from_disk(
        self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="from_disk",
                name=self.__class__.__name__,
            )
        )

    @property
    def supports_prior_probs(self) -> bool:
        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.
        RAISES (NotImplementedError): Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="supports_prior_probs",
                name=self.__class__.__name__,
            )
        )
|