mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
Introduce hierarchy for EL Candidate
objects (#12341)
* Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refacor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Updated error code. * Simplify interface for int/str representations. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename 'alias' to 'mention'. * Port Candidate and InMemoryCandidate to Cython. * Remove redundant entry in setup.py. * Add abstract class check. * Drop storing mention. * Update spacy/kb/candidate.pxd Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix entity_id refactoring problems in docstrings. * Drop unused InMemoryCandidate._entity_hash. * Update docstrings. * Move attributes out of Candidate. * Partially fix alias/mention terminology usage. Convert Candidate to interface. * Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs(). * Update docstrings related to prior_prob. * Update alias/mention usage in doc(strings). 
* Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs. * Update docstrings. * Fix InMemoryCandidate attribute names. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update W401 test. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Use Candidate output type for toy generators in the test suite to mimick best practices * fix docs * fix import --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
6ae7618418
commit
9340eb8ad2
|
@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
"ignoring the duplicate entry.")
|
"ignoring the duplicate entry.")
|
||||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||||
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
||||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
|
||||||
"the Knowledge Base.")
|
"the Knowledge Base.")
|
||||||
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
|
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
|
||||||
"you are constructing a parse tree incrementally by setting "
|
"you are constructing a parse tree incrementally by setting "
|
||||||
|
@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||||
|
|
||||||
|
# v4 warning strings
|
||||||
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
||||||
|
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
|
||||||
|
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
|
||||||
|
"to return `True` in `.supports_prior_probs`.")
|
||||||
|
|
||||||
|
|
||||||
class Errors(metaclass=ErrorsWithCodes):
|
class Errors(metaclass=ErrorsWithCodes):
|
||||||
|
@ -961,6 +965,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"reference and predicted docs.")
|
"reference and predicted docs.")
|
||||||
E4004 = ("Backprop is not supported when is_train is not set.")
|
E4004 = ("Backprop is not supported when is_train is not set.")
|
||||||
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
|
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
|
||||||
|
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
|
||||||
|
|
||||||
|
|
||||||
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
|
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
from .kb import KnowledgeBase
|
from .kb import KnowledgeBase
|
||||||
from .kb_in_memory import InMemoryLookupKB
|
from .kb_in_memory import InMemoryLookupKB
|
||||||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
from .candidate import Candidate, InMemoryCandidate
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
from .kb cimport KnowledgeBase
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
from .kb_in_memory cimport InMemoryLookupKB
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
|
|
||||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
cdef readonly KnowledgeBase kb
|
pass
|
||||||
cdef hash_t entity_hash
|
|
||||||
cdef float entity_freq
|
|
||||||
cdef vector[float] entity_vector
|
cdef class InMemoryCandidate(Candidate):
|
||||||
cdef hash_t alias_hash
|
cdef readonly hash_t _entity_hash
|
||||||
cdef float prior_prob
|
cdef readonly hash_t _alias_hash
|
||||||
|
cpdef vector[float] _entity_vector
|
||||||
|
cdef float _prior_prob
|
||||||
|
cdef readonly InMemoryLookupKB _kb
|
||||||
|
cdef float _entity_freq
|
||||||
|
|
|
@ -1,74 +1,96 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
|
|
||||||
from typing import Iterable
|
from .kb_in_memory cimport InMemoryLookupKB
|
||||||
from .kb cimport KnowledgeBase
|
from ..errors import Errors
|
||||||
from ..tokens import Span
|
|
||||||
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
"""A `Candidate` object refers to a textual mention that may or may not be resolved
|
||||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
to a specific entity from a Knowledge Base. This will be used as input for the entity linking
|
||||||
algorithm which will disambiguate the various candidates to the correct one.
|
algorithm which will disambiguate the various candidates to the correct one.
|
||||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
|
||||||
|
is assigned a certain prior probability.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
def __init__(self):
|
||||||
self.kb = kb
|
# Make sure abstract Candidate is not instantiated.
|
||||||
self.entity_hash = entity_hash
|
if self.__class__ == Candidate:
|
||||||
self.entity_freq = entity_freq
|
raise TypeError(
|
||||||
self.entity_vector = entity_vector
|
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||||
self.alias_hash = alias_hash
|
)
|
||||||
self.prior_prob = prior_prob
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def entity(self) -> int:
|
def entity_id(self) -> int:
|
||||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
"""RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
|
||||||
return self.entity_hash
|
otherwise the hash of the entity ID string)."""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def entity_(self) -> str:
|
def entity_id_(self) -> str:
|
||||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
"""RETURNS (str): String representation of entity ID."""
|
||||||
return self.kb.vocab.strings[self.entity_hash]
|
raise NotImplementedError
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def alias(self) -> int:
|
def entity_vector(self) -> vector[float]:
|
||||||
"""RETURNS (uint64): hash of the alias"""
|
"""RETURNS (vector[float]): Entity vector."""
|
||||||
return self.alias_hash
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
cdef class InMemoryCandidate(Candidate):
|
||||||
|
"""Candidate for InMemoryLookupKB."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
kb: InMemoryLookupKB,
|
||||||
|
entity_hash: int,
|
||||||
|
alias_hash: int,
|
||||||
|
entity_vector: vector[float],
|
||||||
|
prior_prob: float,
|
||||||
|
entity_freq: float
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
kb (InMemoryLookupKB): InMemoryLookupKB instance.
|
||||||
|
entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
|
||||||
|
entity_freq (float): Entity frequency in KB corpus.
|
||||||
|
entity_vector (List[float]): Entity embedding.
|
||||||
|
alias_hash (int): Alias hash.
|
||||||
|
prior_prob (float): Prior probability of entity for this alias. I.e. the probability that, independent of
|
||||||
|
the context, this alias - which matches one of this entity's aliases - resolves to this entity.
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self._entity_hash = entity_hash
|
||||||
|
self._entity_vector = entity_vector
|
||||||
|
self._prior_prob = prior_prob
|
||||||
|
self._kb = kb
|
||||||
|
self._alias_hash = alias_hash
|
||||||
|
self._entity_freq = entity_freq
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def alias_(self) -> str:
|
def entity_id(self) -> int:
|
||||||
"""RETURNS (str): ID of the original alias"""
|
return self._entity_hash
|
||||||
return self.kb.vocab.strings[self.alias_hash]
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def entity_freq(self) -> float:
|
def entity_vector(self) -> vector[float]:
|
||||||
return self.entity_freq
|
return self._entity_vector
|
||||||
|
|
||||||
@property
|
|
||||||
def entity_vector(self) -> Iterable[float]:
|
|
||||||
return self.entity_vector
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def prior_prob(self) -> float:
|
def prior_prob(self) -> float:
|
||||||
return self.prior_prob
|
"""RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
|
||||||
|
this entity."""
|
||||||
|
return self._prior_prob
|
||||||
|
|
||||||
|
@property
|
||||||
|
def alias(self) -> str:
|
||||||
|
"""RETURNS (str): Alias."""
|
||||||
|
return self._kb.vocab.strings[self._alias_hash]
|
||||||
|
|
||||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
@property
|
||||||
"""
|
def entity_id_(self) -> str:
|
||||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
return self._kb.vocab.strings[self._entity_hash]
|
||||||
kb (KnowledgeBase): Knowledge base to query.
|
|
||||||
mention (Span): Entity mention for which to identify candidates.
|
|
||||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
|
||||||
"""
|
|
||||||
return kb.get_candidates(mention)
|
|
||||||
|
|
||||||
|
@property
|
||||||
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
def entity_freq(self) -> float:
|
||||||
"""
|
"""RETURNS (float): Entity frequency in KB corpus."""
|
||||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
return self._entity_freq
|
||||||
kb (KnowledgeBase): Knowledge base to query.
|
|
||||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
|
||||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
|
||||||
"""
|
|
||||||
return kb.get_candidates_batch(mentions)
|
|
||||||
|
|
|
@ -32,9 +32,10 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
Return candidate entities for the specified Span mentions. Each candidate defines at least the entity and the
|
||||||
and the prior probability of that alias resolving to that entity.
|
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||||
If no candidate is found for a given text, an empty list is returned.
|
probability of the specified mention text resolving to that entity - might be included.
|
||||||
|
If no candidates are found for a given mention, an empty list is returned.
|
||||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
"""
|
"""
|
||||||
|
@ -42,9 +43,10 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
Return candidate entities for a specific mention. Each candidate defines at least the entity and the
|
||||||
and the prior probability of that alias resolving to that entity.
|
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||||
If the no candidate is found for a given text, an empty list is returned.
|
probability of the specified mention text resolving to that entity - might be included.
|
||||||
|
If no candidate is found for the given mention, an empty list is returned.
|
||||||
mention (Span): Mention for which to get candidates.
|
mention (Span): Mention for which to get candidates.
|
||||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
"""
|
"""
|
||||||
|
@ -106,3 +108,10 @@ cdef class KnowledgeBase:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def supports_prior_probs(self) -> bool:
|
||||||
|
"""RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
|
||||||
|
raise NotImplementedError(
|
||||||
|
Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
|
||||||
|
)
|
||||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
||||||
from ..util import SimpleFrozenList, ensure_path
|
from ..util import SimpleFrozenList, ensure_path
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
from .candidate import Candidate as Candidate
|
from .candidate import InMemoryCandidate
|
||||||
|
|
||||||
|
|
||||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
@ -226,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
alias_entry.probs = probs
|
alias_entry.probs = probs
|
||||||
self._aliases_table[alias_index] = alias_entry
|
self._aliases_table[alias_index] = alias_entry
|
||||||
|
|
||||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
|
||||||
return self.get_alias_candidates(mention.text) # type: ignore
|
return self._get_alias_candidates(mention.text) # type: ignore
|
||||||
|
|
||||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||||
and the prior probability of that alias resolving to that entity.
|
and the prior probability of that alias resolving to that entity.
|
||||||
|
@ -241,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
|
|
||||||
return [Candidate(kb=self,
|
return [
|
||||||
|
InMemoryCandidate(
|
||||||
|
kb=self,
|
||||||
entity_hash=self._entries[entry_index].entity_hash,
|
entity_hash=self._entries[entry_index].entity_hash,
|
||||||
entity_freq=self._entries[entry_index].freq,
|
|
||||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
|
||||||
alias_hash=alias_hash,
|
alias_hash=alias_hash,
|
||||||
prior_prob=prior_prob)
|
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||||
|
prior_prob=prior_prob,
|
||||||
|
entity_freq=self._entries[entry_index].freq
|
||||||
|
)
|
||||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||||
if entry_index != 0]
|
if entry_index != 0
|
||||||
|
]
|
||||||
|
|
||||||
def get_vector(self, str entity):
|
def get_vector(self, str entity):
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
|
@ -279,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
def supports_prior_probs(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
def to_bytes(self, **kwargs):
|
def to_bytes(self, **kwargs):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -6,7 +6,7 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged
|
||||||
|
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ...kb import KnowledgeBase, InMemoryLookupKB
|
from ...kb import KnowledgeBase, InMemoryLookupKB
|
||||||
from ...kb import Candidate, get_candidates, get_candidates_batch
|
from ...kb import Candidate
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
from ...tokens import Span, Doc
|
from ...tokens import Span, Doc
|
||||||
from ..extract_spans import extract_spans
|
from ..extract_spans import extract_spans
|
||||||
|
@ -117,3 +117,25 @@ def create_candidates_batch() -> Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
]:
|
]:
|
||||||
return get_candidates_batch
|
return get_candidates_batch
|
||||||
|
|
||||||
|
|
||||||
|
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||||
|
"""
|
||||||
|
Return candidate entities for a given mention by fetching appropriate entries from the index.
|
||||||
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
|
mention (Span): Entity mention for which to identify candidates.
|
||||||
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
|
"""
|
||||||
|
return kb.get_candidates(mention)
|
||||||
|
|
||||||
|
|
||||||
|
def get_candidates_batch(
|
||||||
|
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||||
|
) -> Iterable[Iterable[Candidate]]:
|
||||||
|
"""
|
||||||
|
Return candidate entities for the given mentions by fetching appropriate entries from the index.
|
||||||
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
|
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||||
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
|
"""
|
||||||
|
return kb.get_candidates_batch(mentions)
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
|
import warnings
|
||||||
from typing import cast
|
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
|
||||||
from numpy import dtype
|
from numpy import dtype
|
||||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -10,14 +10,13 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
|
||||||
from thinc.api import set_dropout_rate
|
from thinc.api import set_dropout_rate
|
||||||
|
|
||||||
from ..kb import KnowledgeBase, Candidate
|
from ..kb import KnowledgeBase, Candidate
|
||||||
from ..ml import empty_kb
|
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
from .pipe import deserialize_config
|
from .pipe import deserialize_config
|
||||||
from .trainable_pipe import TrainablePipe
|
from .trainable_pipe import TrainablePipe
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..training import Example, validate_examples, validate_get_examples
|
from ..training import Example, validate_examples, validate_get_examples
|
||||||
from ..errors import Errors
|
from ..errors import Errors, Warnings
|
||||||
from ..util import SimpleFrozenList, registry
|
from ..util import SimpleFrozenList, registry
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
|
@ -240,6 +239,8 @@ class EntityLinker(TrainablePipe):
|
||||||
|
|
||||||
if candidates_batch_size < 1:
|
if candidates_batch_size < 1:
|
||||||
raise ValueError(Errors.E1044)
|
raise ValueError(Errors.E1044)
|
||||||
|
if self.incl_prior and not self.kb.supports_prior_probs:
|
||||||
|
warnings.warn(Warnings.W401)
|
||||||
|
|
||||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||||
"""Define the KB of this pipe by providing a function that will
|
"""Define the KB of this pipe by providing a function that will
|
||||||
|
@ -522,18 +523,19 @@ class EntityLinker(TrainablePipe):
|
||||||
)
|
)
|
||||||
elif len(candidates) == 1 and self.threshold is None:
|
elif len(candidates) == 1 and self.threshold is None:
|
||||||
# shortcut for efficiency reasons: take the 1 candidate
|
# shortcut for efficiency reasons: take the 1 candidate
|
||||||
final_kb_ids.append(candidates[0].entity_)
|
final_kb_ids.append(candidates[0].entity_id_)
|
||||||
self._add_activations(
|
self._add_activations(
|
||||||
doc_scores=doc_scores,
|
doc_scores=doc_scores,
|
||||||
doc_ents=doc_ents,
|
doc_ents=doc_ents,
|
||||||
scores=[1.0],
|
scores=[1.0],
|
||||||
ents=[candidates[0].entity_],
|
ents=[candidates[0].entity_id],
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
random.shuffle(candidates)
|
random.shuffle(candidates)
|
||||||
# set all prior probabilities to 0 if incl_prior=False
|
# set all prior probabilities to 0 if incl_prior=False
|
||||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
if self.incl_prior and self.kb.supports_prior_probs:
|
||||||
if not self.incl_prior:
|
prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore
|
||||||
|
else:
|
||||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||||
scores = prior_probs
|
scores = prior_probs
|
||||||
# add in similarity from the context
|
# add in similarity from the context
|
||||||
|
@ -557,7 +559,7 @@ class EntityLinker(TrainablePipe):
|
||||||
raise ValueError(Errors.E161)
|
raise ValueError(Errors.E161)
|
||||||
scores = prior_probs + sims - (prior_probs * sims)
|
scores = prior_probs + sims - (prior_probs * sims)
|
||||||
final_kb_ids.append(
|
final_kb_ids.append(
|
||||||
candidates[scores.argmax().item()].entity_
|
candidates[scores.argmax().item()].entity_id_
|
||||||
if self.threshold is None
|
if self.threshold is None
|
||||||
or scores.max() >= self.threshold
|
or scores.max() >= self.threshold
|
||||||
else EntityLinker.NIL
|
else EntityLinker.NIL
|
||||||
|
@ -566,7 +568,7 @@ class EntityLinker(TrainablePipe):
|
||||||
doc_scores=doc_scores,
|
doc_scores=doc_scores,
|
||||||
doc_ents=doc_ents,
|
doc_ents=doc_ents,
|
||||||
scores=scores,
|
scores=scores,
|
||||||
ents=[c.entity for c in candidates],
|
ents=[c.entity_id for c in candidates],
|
||||||
)
|
)
|
||||||
self._add_doc_activations(
|
self._add_doc_activations(
|
||||||
docs_scores=docs_scores,
|
docs_scores=docs_scores,
|
||||||
|
|
|
@ -7,10 +7,10 @@ from thinc.types import Ragged
|
||||||
from spacy import registry, util
|
from spacy import registry, util
|
||||||
from spacy.attrs import ENT_KB_ID
|
from spacy.attrs import ENT_KB_ID
|
||||||
from spacy.compat import pickle
|
from spacy.compat import pickle
|
||||||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.ml import load_kb
|
from spacy.ml import load_kb
|
||||||
from spacy.ml.models.entity_linker import build_span_maker
|
from spacy.ml.models.entity_linker import build_span_maker, get_candidates
|
||||||
from spacy.pipeline import EntityLinker, TrainablePipe
|
from spacy.pipeline import EntityLinker, TrainablePipe
|
||||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||||
from spacy.scorer import Scorer
|
from spacy.scorer import Scorer
|
||||||
|
@ -465,16 +465,17 @@ def test_candidate_generation(nlp):
|
||||||
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||||
|
|
||||||
# test the size of the relevant candidates
|
# test the size of the relevant candidates
|
||||||
|
adam_ent_cands = get_candidates(mykb, adam_ent)
|
||||||
assert len(get_candidates(mykb, douglas_ent)) == 2
|
assert len(get_candidates(mykb, douglas_ent)) == 2
|
||||||
assert len(get_candidates(mykb, adam_ent)) == 1
|
assert len(adam_ent_cands) == 1
|
||||||
assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive
|
assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive
|
||||||
assert len(get_candidates(mykb, shrubbery_ent)) == 0
|
assert len(get_candidates(mykb, shrubbery_ent)) == 0
|
||||||
|
|
||||||
# test the content of the candidates
|
# test the content of the candidates
|
||||||
assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
|
assert adam_ent_cands[0].entity_id_ == "Q2"
|
||||||
assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
|
assert adam_ent_cands[0].alias == "adam"
|
||||||
assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
|
assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
|
||||||
assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
|
assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
|
||||||
|
|
||||||
|
|
||||||
def test_el_pipe_configuration(nlp):
|
def test_el_pipe_configuration(nlp):
|
||||||
|
@ -502,7 +503,7 @@ def test_el_pipe_configuration(nlp):
|
||||||
assert doc[2].ent_kb_id_ == "Q2"
|
assert doc[2].ent_kb_id_ == "Q2"
|
||||||
|
|
||||||
def get_lowercased_candidates(kb, span):
|
def get_lowercased_candidates(kb, span):
|
||||||
return kb.get_alias_candidates(span.text.lower())
|
return kb._get_alias_candidates(span.text.lower())
|
||||||
|
|
||||||
def get_lowercased_candidates_batch(kb, spans):
|
def get_lowercased_candidates_batch(kb, spans):
|
||||||
return [get_lowercased_candidates(kb, span) for span in spans]
|
return [get_lowercased_candidates(kb, span) for span in spans]
|
||||||
|
@ -561,24 +562,22 @@ def test_vocab_serialization(nlp):
|
||||||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||||
|
|
||||||
candidates = mykb.get_alias_candidates("adam")
|
candidates = mykb._get_alias_candidates("adam")
|
||||||
assert len(candidates) == 1
|
assert len(candidates) == 1
|
||||||
assert candidates[0].entity == q2_hash
|
assert candidates[0].entity_id == q2_hash
|
||||||
assert candidates[0].entity_ == "Q2"
|
assert candidates[0].entity_id_ == "Q2"
|
||||||
assert candidates[0].alias == adam_hash
|
assert candidates[0].alias == "adam"
|
||||||
assert candidates[0].alias_ == "adam"
|
|
||||||
|
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
mykb.to_disk(d / "kb")
|
mykb.to_disk(d / "kb")
|
||||||
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
|
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
|
||||||
kb_new_vocab.from_disk(d / "kb")
|
kb_new_vocab.from_disk(d / "kb")
|
||||||
|
|
||||||
candidates = kb_new_vocab.get_alias_candidates("adam")
|
candidates = kb_new_vocab._get_alias_candidates("adam")
|
||||||
assert len(candidates) == 1
|
assert len(candidates) == 1
|
||||||
assert candidates[0].entity == q2_hash
|
assert candidates[0].entity_id == q2_hash
|
||||||
assert candidates[0].entity_ == "Q2"
|
assert candidates[0].entity_id_ == "Q2"
|
||||||
assert candidates[0].alias == adam_hash
|
assert candidates[0].alias == "adam"
|
||||||
assert candidates[0].alias_ == "adam"
|
|
||||||
|
|
||||||
assert kb_new_vocab.get_vector("Q2") == [2]
|
assert kb_new_vocab.get_vector("Q2") == [2]
|
||||||
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
|
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
|
||||||
|
@ -598,20 +597,20 @@ def test_append_alias(nlp):
|
||||||
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||||
|
|
||||||
# test the size of the relevant candidates
|
# test the size of the relevant candidates
|
||||||
assert len(mykb.get_alias_candidates("douglas")) == 2
|
assert len(mykb._get_alias_candidates("douglas")) == 2
|
||||||
|
|
||||||
# append an alias
|
# append an alias
|
||||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
|
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
|
||||||
|
|
||||||
# test the size of the relevant candidates has been incremented
|
# test the size of the relevant candidates has been incremented
|
||||||
assert len(mykb.get_alias_candidates("douglas")) == 3
|
assert len(mykb._get_alias_candidates("douglas")) == 3
|
||||||
|
|
||||||
# append the same alias-entity pair again should not work (will throw a warning)
|
# append the same alias-entity pair again should not work (will throw a warning)
|
||||||
with pytest.warns(UserWarning):
|
with pytest.warns(UserWarning):
|
||||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
|
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
|
||||||
|
|
||||||
# test the size of the relevant candidates remained unchanged
|
# test the size of the relevant candidates remained unchanged
|
||||||
assert len(mykb.get_alias_candidates("douglas")) == 3
|
assert len(mykb._get_alias_candidates("douglas")) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore:\\[W036")
|
@pytest.mark.filterwarnings("ignore:\\[W036")
|
||||||
|
@ -908,11 +907,11 @@ def test_kb_to_bytes():
|
||||||
assert kb_2.contains_alias("Russ Cochran")
|
assert kb_2.contains_alias("Russ Cochran")
|
||||||
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
|
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
|
||||||
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
|
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
|
||||||
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
|
assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
|
||||||
kb_2.get_alias_candidates("Russ Cochran")
|
kb_2._get_alias_candidates("Russ Cochran")
|
||||||
)
|
)
|
||||||
assert len(kb_1.get_alias_candidates("Randomness")) == len(
|
assert len(kb_1._get_alias_candidates("Randomness")) == len(
|
||||||
kb_2.get_alias_candidates("Randomness")
|
kb_2._get_alias_candidates("Randomness")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -66,19 +66,21 @@ def _check_kb(kb):
|
||||||
assert alias_string not in kb.get_alias_strings()
|
assert alias_string not in kb.get_alias_strings()
|
||||||
|
|
||||||
# check candidates & probabilities
|
# check candidates & probabilities
|
||||||
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
|
candidates = sorted(
|
||||||
|
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
|
||||||
|
)
|
||||||
assert len(candidates) == 2
|
assert len(candidates) == 2
|
||||||
|
|
||||||
assert candidates[0].entity_ == "Q007"
|
assert candidates[0].entity_id_ == "Q007"
|
||||||
assert 6.999 < candidates[0].entity_freq < 7.01
|
assert 6.999 < candidates[0].entity_freq < 7.01
|
||||||
assert candidates[0].entity_vector == [0, 0, 7]
|
assert candidates[0].entity_vector == [0, 0, 7]
|
||||||
assert candidates[0].alias_ == "double07"
|
assert candidates[0].alias == "double07"
|
||||||
assert 0.899 < candidates[0].prior_prob < 0.901
|
assert 0.899 < candidates[0].prior_prob < 0.901
|
||||||
|
|
||||||
assert candidates[1].entity_ == "Q17"
|
assert candidates[1].entity_id_ == "Q17"
|
||||||
assert 1.99 < candidates[1].entity_freq < 2.01
|
assert 1.99 < candidates[1].entity_freq < 2.01
|
||||||
assert candidates[1].entity_vector == [7, 1, 0]
|
assert candidates[1].entity_vector == [7, 1, 0]
|
||||||
assert candidates[1].alias_ == "double07"
|
assert candidates[1].alias == "double07"
|
||||||
assert 0.099 < candidates[1].prior_prob < 0.101
|
assert 0.099 < candidates[1].prior_prob < 0.101
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,9 @@ version: 3.5
|
||||||
|
|
||||||
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
|
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
|
||||||
implements all of its methods. It stores all KB data in-memory and generates
|
implements all of its methods. It stores all KB data in-memory and generates
|
||||||
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
|
[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
|
||||||
entity names. It's highly optimized for both a low memory footprint and speed of
|
with entity names. It's highly optimized for both a low memory footprint and
|
||||||
retrieval.
|
speed of retrieval.
|
||||||
|
|
||||||
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
|
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
|
||||||
|
|
||||||
|
@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
|
||||||
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
|
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
|
||||||
|
|
||||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
of type [`Candidate`](/api/kb#candidate). Wraps
|
of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
|
||||||
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
the private `_get_alias_candidates()` method.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -169,9 +169,9 @@ of type [`Candidate`](/api/kb#candidate). Wraps
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------------------------------------------- |
|
| ----------- | ------------------------------------------------------------------------------------ |
|
||||||
| `mention` | The textual mention or alias. ~~Span~~ |
|
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||||
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
|
||||||
|
|
||||||
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
|
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
|
||||||
|
|
||||||
|
@ -195,25 +195,9 @@ to you.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
| ----------- | ------------------------------------------------------------------------------------------------------------ |
|
||||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
| `mentions` | The textual mentions. ~~Iterable[Span]~~ |
|
||||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
|
||||||
|
|
||||||
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
|
|
||||||
|
|
||||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
|
||||||
of type [`Candidate`](/api/kb#candidate).
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> candidates = kb.get_alias_candidates("Douglas")
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ----------- | ------------------------------------------------------------- |
|
|
||||||
| `alias` | The textual mention or alias. ~~str~~ |
|
|
||||||
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
|
||||||
|
|
||||||
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -103,23 +103,6 @@ to you.
|
||||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
||||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||||
|
|
||||||
## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
|
|
||||||
|
|
||||||
<Infobox variant="warning">
|
|
||||||
This method is _not_ available from spaCy 3.5 onwards.
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
|
|
||||||
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
|
|
||||||
allow more flexibility in customizing knowledge bases. Some of its methods were
|
|
||||||
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
|
|
||||||
one of those being `get_alias_candidates()`. This method is now available as
|
|
||||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
|
||||||
Note:
|
|
||||||
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
|
|
||||||
defaults to
|
|
||||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
|
||||||
|
|
||||||
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
||||||
|
|
||||||
Given a certain entity ID, retrieve its pretrained entity vector.
|
Given a certain entity ID, retrieve its pretrained entity vector.
|
||||||
|
@ -190,25 +173,27 @@ Restore the state of the knowledge base from a given directory. Note that the
|
||||||
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||||
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
||||||
|
|
||||||
## Candidate {id="candidate",tag="class"}
|
## InMemoryCandidate {id="candidate",tag="class"}
|
||||||
|
|
||||||
A `Candidate` object refers to a textual mention (alias) that may or may not be
|
An `InMemoryCandidate` object refers to a textual mention (alias) that may or
|
||||||
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
|
may not be resolved to a specific entity from a `KnowledgeBase`. This will be
|
||||||
for the entity linking algorithm which will disambiguate the various candidates
|
used as input for the entity linking algorithm which will disambiguate the
|
||||||
to the correct one. Each candidate `(alias, entity)` pair is assigned to a
|
various candidates to the correct one. Each candidate `(alias, entity)` pair is
|
||||||
certain prior probability.
|
assigned a certain prior probability.
|
||||||
|
|
||||||
### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
||||||
|
|
||||||
Construct a `Candidate` object. Usually this constructor is not called directly,
|
Construct an `InMemoryCandidate` object. Usually this constructor is not called
|
||||||
but instead these objects are returned by the `get_candidates` method of the
|
directly, but instead these objects are returned by the `get_candidates` method
|
||||||
[`entity_linker`](/api/entitylinker) pipe.
|
of the [`entity_linker`](/api/entitylinker) pipe.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from spacy.kb import InMemoryCandidate
|
||||||
|
> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||||
|
> ```
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> from spacy.kb import Candidate
|
|
||||||
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -216,10 +201,10 @@ but instead these objects are returned by the `get_candidates` method of the
|
||||||
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
|
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
|
||||||
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
|
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
|
||||||
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
|
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
|
||||||
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
|
| `alias_hash` | The hash of the entity alias. ~~int~~ |
|
||||||
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||||
|
|
||||||
## Candidate attributes {id="candidate-attributes"}
|
## InMemoryCandidate attributes {id="candidate-attributes"}
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | ------------------------------------------------------------------------ |
|
| --------------- | ------------------------------------------------------------------------ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user