mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
304b9331e6
* Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span]. * Update docs. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refactor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Modify EL batching system. * Update leftover get_candidates() mention in docs. * Format docs. * Format. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Updated error code. * Simplify interface for int/str representations. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename 'alias' to 'mention'. * Port Candidate and InMemoryCandidate to Cython. * Remove redundant entry in setup.py. * Add abstract class check. * Drop storing mention. * Update spacy/kb/candidate.pxd Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix entity_id refactoring problems in docstrings. * Drop unused InMemoryCandidate._entity_hash. * Update docstrings. * Move attributes out of Candidate. * Partially fix alias/mention terminology usage. Convert Candidate to interface. * Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs(). 
* Update docstrings related to prior_prob. * Update alias/mention usage in doc(strings). * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs. * Update docstrings. * Fix InMemoryCandidate attribute names. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update W401 test. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Use Candidate output type for toy generators in the test suite to mimic best practices * fix docs * fix import * Fix merge leftovers. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/entitylinker.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb_in_memory.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/inmemorylookupkb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update get_candidates() docstring. * Reformat imports in entity_linker.py. * Drop valid_ent_idx_per_doc. * Update docs. * Format. * Simplify doc loop in predict(). * Remove E1044 comment. * Fix merge errors. * Format. * Format. * Format. * Fix merge error & tests. * Format. 
* Apply suggestions from code review Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> * Use type alias. * isort. * isort. * Lint. * Add typedefs.pyx. * Fix typedef import. * Fix type aliases. * Format. * Update docstring and type usage. * Add info on get_candidates(), get_candidates_batched(). * Readd get_candidates info to v3 changelog. * Update website/docs/api/entitylinker.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update factory functions for backwards compatibility. * Format. * Ignore mypy error. * Fix mypy error. * Format. * Add test for multiple docs with multiple entities. --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> Co-authored-by: svlandeg <svlandeg@github.com>
128 lines
4.7 KiB
Cython
128 lines
4.7 KiB
Cython
# cython: infer_types=True
|
|
|
|
from pathlib import Path
|
|
from typing import Iterable, Iterator, Tuple, Union
|
|
|
|
from cymem.cymem cimport Pool
|
|
|
|
from ..errors import Errors
|
|
from ..tokens import SpanGroup
|
|
from ..util import SimpleFrozenList
|
|
from .candidate cimport Candidate
|
|
|
|
|
|
cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and
    their textual aliases, to support entity linking of named entities to
    real-world concepts.
    This is an abstract class and requires its operations to be implemented.

    DOCS: https://spacy.io/api/kb
    """
    # Type aliases used in the method signatures below: the candidates for a
    # single mention, and the per-mention candidates collected for one Doc.
    CandidatesForMentionT = Iterable[Candidate]
    CandidatesForDocT = Iterable[CandidatesForMentionT]

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase.

        vocab (Vocab): Vocabulary, stored as `self.vocab`.
        entity_vector_length (int): Stored as `self.entity_vector_length`.
        RAISES (TypeError): If `KnowledgeBase` itself, rather than a subclass,
            is being instantiated.
        """
        # Make sure abstract KB is not instantiated.
        if self.__class__ == KnowledgeBase:
            raise TypeError(
                Errors.E1046.format(cls_name=self.__class__.__name__)
            )

        self.vocab = vocab
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[CandidatesForDocT]:
        """
        Return candidate entities for the specified groups of mentions (as SpanGroup) per Doc.
        Each candidate for a mention defines at least the entity and the entity's embedding vector. Depending on the KB
        implementation, further properties - such as the prior probability of the specified mention text resolving to
        that entity - might be included.
        If no candidates are found for a given mention, an empty list is returned.
        mentions (Iterator[SpanGroup]): Mentions for which to get candidates.
        RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mention/doc/doc batch.
        RAISES (NotImplementedError): Always, in this abstract base class.
        """
        # Instances have no `__name__`; the class name has to be read from the
        # type, otherwise an AttributeError masks the intended error.
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="get_candidates",
                name=self.__class__.__name__,
            )
        )

    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
        """
        Return vectors for entities.
        entities (Iterable[str]): Entity names/IDs.
        RETURNS (Iterable[Iterable[float]]): Vectors for specified entities,
            one `get_vector()` result per entity, in input order.
        """
        return [self.get_vector(entity) for entity in entities]

    def get_vector(self, str entity) -> Iterable[float]:
        """
        Return vector for entity.
        entity (str): Entity name/ID.
        RETURNS (Iterable[float]): Vector for specified entity.
        RAISES (NotImplementedError): Always, in this abstract base class.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="get_vector",
                name=self.__class__.__name__,
            )
        )

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the current state to a binary string.
        RETURNS (bytes): Current state as binary string.
        RAISES (NotImplementedError): Always, in this abstract base class.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="to_bytes",
                name=self.__class__.__name__,
            )
        )

    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
        """Load state from a binary string.
        bytes_data (bytes): KB state.
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        RAISES (NotImplementedError): Always, in this abstract base class.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="from_bytes",
                name=self.__class__.__name__,
            )
        )

    def to_disk(
        self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        RAISES (NotImplementedError): Always, in this abstract base class.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="to_disk",
                name=self.__class__.__name__,
            )
        )

    def from_disk(
        self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        RAISES (NotImplementedError): Always, in this abstract base class.
        """
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="from_disk",
                name=self.__class__.__name__,
            )
        )

    @property
    def supports_prior_probs(self) -> bool:
        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
        raise NotImplementedError(
            Errors.E1045.format(
                parent="KnowledgeBase",
                method="supports_prior_probs",
                name=self.__class__.__name__,
            )
        )
|