Merge branch 'refactor/el-candidates' into feature/docwise-generator-batching

# Conflicts:
#	spacy/kb/candidate.py
#	spacy/kb/kb.pyx
#	spacy/kb/kb_in_memory.pyx
#	spacy/ml/models/entity_linker.py
#	spacy/pipeline/entity_linker.py
#	spacy/tests/pipeline/test_entity_linker.py
#	website/docs/api/inmemorylookupkb.mdx
#	website/docs/api/kb.mdx
This commit is contained in:
Raphael Mitsch 2023-03-20 10:24:17 +01:00
commit 73bdeb01e4
14 changed files with 185 additions and 209 deletions

View File

@ -30,6 +30,7 @@ MOD_NAMES = [
"spacy.lexeme", "spacy.lexeme",
"spacy.vocab", "spacy.vocab",
"spacy.attrs", "spacy.attrs",
"spacy.kb.candidate",
"spacy.kb.kb", "spacy.kb.kb",
"spacy.kb.kb_in_memory", "spacy.kb.kb_in_memory",
"spacy.ml.tb_framework", "spacy.ml.tb_framework",

View File

@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
"ignoring the duplicate entry.") "ignoring the duplicate entry.")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.") "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
"the Knowledge Base.") "the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If " W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting " "you are constructing a parse tree incrementally by setting "
@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
# v4 warning strings
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
"to return `True` in `.supports_prior_probs`.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -960,6 +964,9 @@ class Errors(metaclass=ErrorsWithCodes):
E4003 = ("Training examples for distillation must have the exact same tokens in the " E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.") "reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.") E4004 = ("Backprop is not supported when is_train is not set.")
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@ -2,4 +2,5 @@ from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, InMemoryCandidate from .candidate import Candidate, InMemoryCandidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

15
spacy/kb/candidate.pxd Normal file
View File

@ -0,0 +1,15 @@
from libcpp.vector cimport vector
from .kb_in_memory cimport InMemoryLookupKB
from ..typedefs cimport hash_t
# Base extension type for KB candidates; it declares no C-level attributes itself.
cdef class Candidate:
pass
cdef class InMemoryCandidate(Candidate):
    # Hash of the entity ID; readable from Python.
    cdef readonly hash_t _entity_hash
    # Hash of the alias this candidate was retrieved for; readable from Python.
    cdef readonly hash_t _alias_hash
    # Entity embedding. Declared with plain `cdef`: `cpdef` is only meaningful for
    # functions/methods and is not supported for attributes by newer Cython versions.
    cdef vector[float] _entity_vector
    # Prior probability of the alias resolving to this entity.
    cdef float _prior_prob
    # Knowledge base this candidate originates from; readable from Python.
    cdef readonly InMemoryLookupKB _kb
    # Frequency of the entity as recorded in the KB.
    cdef float _entity_freq

View File

@ -1,118 +0,0 @@
import abc
from typing import List, Callable
class Candidate(abc.ABC):
    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate (alias, entity_id) pair is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(
        self,
        mention: str,
        entity_id: int,
        entity_name: str,
        entity_vector: List[float],
        prior_prob: float,
    ):
        """Initializes properties of `Candidate` instance.

        mention (str): Mention text for this candidate.
        entity_id (int): Unique entity ID.
        entity_name (str): Entity name.
        entity_vector (List[float]): Entity embedding.
        prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
            the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
            doesn't) it might be better to eschew this information and always supply the same value.
        """
        self._mention = mention
        self._entity_id = entity_id
        self._entity_name = entity_name
        self._entity_vector = entity_vector
        self._prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (int): Unique entity ID."""
        return self._entity_id

    @property
    def entity_(self) -> str:
        """RETURNS (str): Entity name."""
        return self._entity_name

    @property
    def mention(self) -> str:
        """RETURNS (str): Mention."""
        return self._mention

    @property
    def entity_vector(self) -> List[float]:
        """RETURNS (List[float]): Entity vector."""
        return self._entity_vector

    @property
    def prior_prob(self) -> float:
        """RETURNS (float): Prior probability of the entity for this mention."""
        return self._prior_prob
class InMemoryCandidate(Candidate):
    """Candidate for InMemoryLookupKB."""

    def __init__(
        self,
        retrieve_string_from_hash: Callable[[int], str],
        entity_hash: int,
        entity_freq: float,
        entity_vector: List[float],
        alias_hash: int,
        prior_prob: float,
    ):
        """
        retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
            hash.
        entity_hash (int): Hashed entity name / ID.
        entity_freq (float): Entity frequency in KB corpus.
        entity_vector (List[float]): Entity embedding.
        alias_hash (int): Hashed alias.
        prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
            the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
            doesn't) it might be better to eschew this information and always supply the same value.
        """
        # The base class stores mention, entity ID/name, vector and prior probability,
        # so none of those need to be assigned again here.
        super().__init__(
            mention=retrieve_string_from_hash(alias_hash),
            entity_id=entity_hash,
            entity_name=retrieve_string_from_hash(entity_hash),
            entity_vector=entity_vector,
            prior_prob=prior_prob,
        )
        self._retrieve_string_from_hash = retrieve_string_from_hash
        self._entity_hash = entity_hash
        self._entity_freq = entity_freq
        self._alias_hash = alias_hash

    @property
    def entity(self) -> int:
        """RETURNS (int): hash of the entity_id's KB ID/name"""
        return self._entity_hash

    @property
    def alias(self) -> int:
        """RETURNS (int): hash of the alias"""
        return self._alias_hash

    @property
    def alias_(self) -> str:
        """RETURNS (str): ID of the original alias"""
        return self._retrieve_string_from_hash(self._alias_hash)

    @property
    def entity_freq(self) -> float:
        """RETURNS (float): Entity frequency in KB corpus."""
        return self._entity_freq

96
spacy/kb/candidate.pyx Normal file
View File

@ -0,0 +1,96 @@
# cython: infer_types=True, profile=True
from .kb_in_memory cimport InMemoryLookupKB
from ..errors import Errors
cdef class Candidate:
    """Abstract link between one textual mention and one knowledge base entity.

    A mention may or may not be resolvable to a specific entity. Candidates are the input of the
    entity linking algorithm, which disambiguates between them to pick the correct one. Each
    candidate is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(self):
        # Only subclasses may be instantiated - the base class itself is abstract.
        cls = self.__class__
        if cls == Candidate:
            raise TypeError(Errors.E1046.format(cls_name=cls.__name__))

    @property
    def entity_id(self) -> int:
        """RETURNS (int): Numerical representation of the entity ID: the ID itself if it is numerical,
        otherwise the hash of the entity ID string."""
        raise NotImplementedError

    @property
    def entity_id_(self) -> str:
        """RETURNS (str): Entity ID as string."""
        raise NotImplementedError

    @property
    def entity_vector(self) -> vector[float]:
        """RETURNS (vector[float]): Vector of the entity."""
        raise NotImplementedError
cdef class InMemoryCandidate(Candidate):
    """Candidate for InMemoryLookupKB."""

    def __init__(
        self,
        kb: InMemoryLookupKB,
        entity_hash: int,
        alias_hash: int,
        entity_vector: vector[float],
        prior_prob: float,
        entity_freq: float
    ):
        """
        kb (InMemoryLookupKB): InMemoryLookupKB instance.
        entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
        alias_hash (int): Alias hash.
        entity_vector (vector[float]): Entity embedding.
        prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
            the context, this alias - which matches one of this entity's aliases - resolves to this entity.
        entity_freq (float): Entity frequency in KB corpus.
        """
        super().__init__()

        self._entity_hash = entity_hash
        self._entity_vector = entity_vector
        self._prior_prob = prior_prob
        self._kb = kb
        self._alias_hash = alias_hash
        self._entity_freq = entity_freq

    @property
    def entity_id(self) -> int:
        """RETURNS (int): Hash of the entity ID."""
        return self._entity_hash

    @property
    def entity_vector(self) -> vector[float]:
        """RETURNS (vector[float]): Entity vector."""
        return self._entity_vector

    @property
    def prior_prob(self) -> float:
        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
        this entity."""
        return self._prior_prob

    @property
    def alias(self) -> str:
        """RETURNS (str): Alias."""
        # The alias is stored as a hash; the KB's vocab strings map it back to text.
        return self._kb.vocab.strings[self._alias_hash]

    @property
    def entity_id_(self) -> str:
        """RETURNS (str): Entity ID as string, looked up from the entity hash in the KB's vocab strings."""
        return self._kb.vocab.strings[self._entity_hash]

    @property
    def entity_freq(self) -> float:
        """RETURNS (float): Entity frequency in KB corpus."""
        return self._entity_freq

View File

@ -32,11 +32,12 @@ cdef class KnowledgeBase:
def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[Candidate]]]: def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[Candidate]]]:
""" """
Return candidate entities for mentions stored in `ent` attribute in passed docs. Each candidate defines the Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
entity, the original alias, and the prior probability of that alias resolving to that entity. entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
If no candidate is found for a given mention, an empty list is returned. probability of the specified mention text resolving to that entity - might be included.
mentions (Iterator[SpanGroup]): Mentions per doc as SpanGroup instance. If no candidates are found for a given mention, an empty list is returned.
RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document. mentions (Iterable[SpanGroup]): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
@ -96,3 +97,10 @@ cdef class KnowledgeBase:
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
) )
@property
def supports_prior_probs(self) -> bool:
    """Tell whether prior probabilities can be looked up for entity mentions in this KB type.

    Not implemented on the base class.
    RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.
    """
    # NOTE(review): `self.__name__` is looked up on the instance here, as in the sibling
    # methods of this class - confirm that attribute exists on KB instances.
    message = Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
    raise NotImplementedError(message)

View File

@ -230,7 +230,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
for mentions_for_doc in mentions: for mentions_for_doc in mentions:
yield [self.get_alias_candidates(ent_span.text) for ent_span in mentions_for_doc] yield [self.get_alias_candidates(ent_span.text) for ent_span in mentions_for_doc]
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
""" """
Return candidate entities for an alias. Each candidate defines the entity, the original alias, Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity. and the prior probability of that alias resolving to that entity.
@ -244,12 +244,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return [ return [
InMemoryCandidate( InMemoryCandidate(
retrieve_string_from_hash=self.vocab.strings.__getitem__, kb=self,
entity_hash=self._entries[entry_index].entity_hash, entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prior_prob entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
prior_prob=prior_prob,
entity_freq=self._entries[entry_index].freq
) )
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0 if entry_index != 0
@ -284,6 +284,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return 0.0 return 0.0
@property
def supports_prior_probs(self) -> bool:
    """RETURNS (bool): Always True for this KB type."""
    # Exposed as a property to match the `KnowledgeBase.supports_prior_probs` declaration and
    # attribute-style reads (`kb.supports_prior_probs`); as a plain method the attribute
    # would be a bound method and therefore always truthy.
    return True
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
""" """

View File

@ -108,7 +108,7 @@ def empty_kb(
@registry.misc("spacy.CandidateGenerator.v1") @registry.misc("spacy.CandidateGenerator.v1")
def create_candidates_all() -> Callable[ def create_get_candidates() -> Callable[
[KnowledgeBase, Iterator[SpanGroup]], [KnowledgeBase, Iterator[SpanGroup]],
Iterator[Iterable[Iterable[Candidate]]], Iterator[Iterable[Iterable[Candidate]]],
]: ]:

View File

@ -1,4 +1,6 @@
import warnings
from typing import ( from typing import (
cast,
Optional, Optional,
Iterable, Iterable,
Callable, Callable,
@ -9,7 +11,6 @@ from typing import (
Any, Any,
Iterator, Iterator,
) )
from typing import cast
from numpy import dtype from numpy import dtype
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
from pathlib import Path from pathlib import Path
@ -27,7 +28,7 @@ from .trainable_pipe import TrainablePipe
from ..language import Language from ..language import Language
from ..vocab import Vocab from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, registry from ..util import SimpleFrozenList, registry
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
@ -120,28 +121,9 @@ def make_entity_linker(
prediction is discarded. If None, predictions are not filtered by any threshold. prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating. save_activations (bool): save model activations in Doc when annotating.
""" """
if not model.attrs.get("include_span_maker", False): if not model.attrs.get("include_span_maker", False):
try: raise ValueError(Errors.E4005)
from spacy_legacy.components.entity_linker import EntityLinker_v1
except:
raise ImportError(
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
)
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
model, model,
@ -251,6 +233,9 @@ class EntityLinker(TrainablePipe):
self.threshold = threshold self.threshold = threshold
self.save_activations = save_activations self.save_activations = save_activations
if self.incl_prior and not self.kb.supports_prior_probs:
warnings.warn(Warnings.W401)
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
create it using this object's vocab.""" create it using this object's vocab."""

View File

@ -7,7 +7,7 @@ from thinc.types import Ragged
from spacy import registry, util from spacy import registry, util
from spacy.attrs import ENT_KB_ID from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle from spacy.compat import pickle
from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
from spacy.lang.en import English from spacy.lang.en import English
from spacy.ml import load_kb from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.ml.models.entity_linker import build_span_maker, get_candidates
@ -479,8 +479,8 @@ def test_candidate_generation(nlp):
) )
# test the content of the candidates # test the content of the candidates
assert adam_ent_cands[0].entity_ == "Q2" assert adam_ent_cands[0].entity_id_ == "Q2"
assert adam_ent_cands[0].alias_ == "adam" assert adam_ent_cands[0].alias == "adam"
assert_almost_equal(adam_ent_cands[0].entity_freq, 12) assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
@ -519,7 +519,7 @@ def test_el_pipe_configuration(nlp):
@registry.misc("spacy.LowercaseCandidateGenerator.v1") @registry.misc("spacy.LowercaseCandidateGenerator.v1")
def create_candidates() -> Callable[ def create_candidates() -> Callable[
[InMemoryLookupKB, Iterator[SpanGroup]], [InMemoryLookupKB, Iterator[SpanGroup]],
Iterator[Iterable[Iterable[InMemoryCandidate]]], Iterator[Iterable[Iterable[Candidate]]],
]: ]:
return get_lowercased_candidates return get_lowercased_candidates
@ -562,24 +562,22 @@ def test_vocab_serialization(nlp):
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam") candidates = mykb._get_alias_candidates("adam")
assert len(candidates) == 1 assert len(candidates) == 1
assert candidates[0].entity == q2_hash assert candidates[0].entity_id == q2_hash
assert candidates[0].entity_ == "Q2" assert candidates[0].entity_id_ == "Q2"
assert candidates[0].alias == adam_hash assert candidates[0].alias == "adam"
assert candidates[0].alias_ == "adam"
with make_tempdir() as d: with make_tempdir() as d:
mykb.to_disk(d / "kb") mykb.to_disk(d / "kb")
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb") kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam") candidates = kb_new_vocab._get_alias_candidates("adam")
assert len(candidates) == 1 assert len(candidates) == 1
assert candidates[0].entity == q2_hash assert candidates[0].entity_id == q2_hash
assert candidates[0].entity_ == "Q2" assert candidates[0].entity_id_ == "Q2"
assert candidates[0].alias == adam_hash assert candidates[0].alias == "adam"
assert candidates[0].alias_ == "adam"
assert kb_new_vocab.get_vector("Q2") == [2] assert kb_new_vocab.get_vector("Q2") == [2]
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@ -599,20 +597,20 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates # test the size of the relevant candidates
assert len(mykb.get_alias_candidates("douglas")) == 2 assert len(mykb._get_alias_candidates("douglas")) == 2
# append an alias # append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
# test the size of the relevant candidates has been incremented # test the size of the relevant candidates has been incremented
assert len(mykb.get_alias_candidates("douglas")) == 3 assert len(mykb._get_alias_candidates("douglas")) == 3
# append the same alias-entity pair again should not work (will throw a warning) # append the same alias-entity pair again should not work (will throw a warning)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
# test the size of the relevant candidates remained unchanged # test the size of the relevant candidates remained unchanged
assert len(mykb.get_alias_candidates("douglas")) == 3 assert len(mykb._get_alias_candidates("douglas")) == 3
@pytest.mark.filterwarnings("ignore:\\[W036") @pytest.mark.filterwarnings("ignore:\\[W036")
@ -909,11 +907,11 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran") assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings() assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
kb_2.get_alias_candidates("Russ Cochran") kb_2._get_alias_candidates("Russ Cochran")
) )
assert len(kb_1.get_alias_candidates("Randomness")) == len( assert len(kb_1._get_alias_candidates("Randomness")) == len(
kb_2.get_alias_candidates("Randomness") kb_2._get_alias_candidates("Randomness")
) )
@ -994,14 +992,11 @@ def test_scorer_links():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,config", "name,config",
[ [
("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
], ],
) )
# fmt: on # fmt: on
def test_legacy_architectures(name, config): def test_legacy_architectures(name, config):
from spacy_legacy.components.entity_linker import EntityLinker_v1
# Ensure that the legacy architectures still work # Ensure that the legacy architectures still work
vector_length = 3 vector_length = 3
nlp = English() nlp = English()
@ -1023,10 +1018,7 @@ def test_legacy_architectures(name, config):
return mykb return mykb
entity_linker = nlp.add_pipe(name, config={"model": config}) entity_linker = nlp.add_pipe(name, config={"model": config})
if config["@architectures"] == "spacy.EntityLinker.v1": assert isinstance(entity_linker, EntityLinker)
assert isinstance(entity_linker, EntityLinker_v1)
else:
assert isinstance(entity_linker, EntityLinker)
entity_linker.set_kb(create_kb) entity_linker.set_kb(create_kb)
optimizer = nlp.initialize(get_examples=lambda: train_examples) optimizer = nlp.initialize(get_examples=lambda: train_examples)

View File

@ -66,19 +66,21 @@ def _check_kb(kb):
assert alias_string not in kb.get_alias_strings() assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities # check candidates & probabilities
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) candidates = sorted(
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
)
assert len(candidates) == 2 assert len(candidates) == 2
assert candidates[0].entity_ == "Q007" assert candidates[0].entity_id_ == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01 assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == "double07" assert candidates[0].alias == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901 assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == "Q17" assert candidates[1].entity_id_ == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01 assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == "double07" assert candidates[1].alias == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101 assert 0.099 < candidates[1].prior_prob < 0.101

View File

@ -178,22 +178,6 @@ implementation of `KnowledgeBase.get_candidates()`.
| `mentions` | The textual mention or alias. ~~Iterable[SpanGroup]~~ | | `mentions` | The textual mention or alias. ~~Iterable[SpanGroup]~~ |
| **RETURNS** | An iterator over iterables of iterables with relevant [`InMemoryCandidate`](/api/kb#candidate) objects (per mention and doc). ~~Iterator[Iterable[Iterable[InMemoryCandidate]]]~~ | | **RETURNS** | An iterator over iterables of iterables with relevant [`InMemoryCandidate`](/api/kb#candidate) objects (per mention and doc). ~~Iterator[Iterable[Iterable[InMemoryCandidate]]]~~ |
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`InMemoryCandidate`](/api/kb#candidate).
> #### Example
>
> ```python
> candidates = kb.get_alias_candidates("Douglas")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~ |
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
Given a certain entity ID, retrieve its pretrained entity vector. Given a certain entity ID, retrieve its pretrained entity vector.

View File

@ -155,15 +155,15 @@ Restore the state of the knowledge base from a given directory. Note that the
## InMemoryCandidate {id="candidate",tag="class"} ## InMemoryCandidate {id="candidate",tag="class"}
A `InMemoryCandidate` object refers to a textual mention that may or may not be An `InMemoryCandidate` object refers to a textual mention (alias) that may or
resolved to a specific entity from a `KnowledgeBase`. This will be used as input may not be resolved to a specific entity from a `KnowledgeBase`. This will be
for the entity linking algorithm which will disambiguate the various candidates used as input for the entity linking algorithm which will disambiguate the
to the correct one. Each candidate `(mention, entity)` pair is assigned to a various candidates to the correct one. Each candidate `(alias, entity)` pair is
certain prior probability. assigned a certain prior probability.
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"} ### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
Construct a `InMemoryCandidate` object. Usually this constructor is not called Construct an `InMemoryCandidate` object. Usually this constructor is not called
directly, but instead these objects are returned by the `get_candidates` method directly, but instead these objects are returned by the `get_candidates` method
of the [`entity_linker`](/api/entitylinker) pipe. of the [`entity_linker`](/api/entitylinker) pipe.
@ -181,7 +181,7 @@ of the [`entity_linker`](/api/entitylinker) pipe.
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | | `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | | `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ | | `alias_hash` | The hash of the entity alias. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ | | `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## InMemoryCandidate attributes {id="candidate-attributes"} ## InMemoryCandidate attributes {id="candidate-attributes"}