mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 17:41:59 +03:00
Merge branch 'refactor/el-candidates' into feature/docwise-generator-batching
# Conflicts: # spacy/kb/candidate.py # spacy/kb/kb.pyx # spacy/kb/kb_in_memory.pyx # spacy/ml/models/entity_linker.py # spacy/pipeline/entity_linker.py # spacy/tests/pipeline/test_entity_linker.py # website/docs/api/inmemorylookupkb.mdx # website/docs/api/kb.mdx
This commit is contained in:
commit
73bdeb01e4
1
setup.py
1
setup.py
|
@ -30,6 +30,7 @@ MOD_NAMES = [
|
|||
"spacy.lexeme",
|
||||
"spacy.vocab",
|
||||
"spacy.attrs",
|
||||
"spacy.kb.candidate",
|
||||
"spacy.kb.kb",
|
||||
"spacy.kb.kb_in_memory",
|
||||
"spacy.ml.tb_framework",
|
||||
|
|
|
@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"ignoring the duplicate entry.")
|
||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
|
||||
"the Knowledge Base.")
|
||||
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
|
||||
"you are constructing a parse tree incrementally by setting "
|
||||
|
@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||
|
||||
# v4 warning strings
|
||||
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
||||
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
|
||||
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
|
||||
"to return `True` in `.supports_prior_probs`.")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -960,6 +964,9 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E4003 = ("Training examples for distillation must have the exact same tokens in the "
|
||||
"reference and predicted docs.")
|
||||
E4004 = ("Backprop is not supported when is_train is not set.")
|
||||
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
|
||||
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
|
||||
|
||||
|
||||
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
|
||||
|
||||
|
|
|
@ -2,4 +2,5 @@ from .kb import KnowledgeBase
|
|||
from .kb_in_memory import InMemoryLookupKB
|
||||
from .candidate import Candidate, InMemoryCandidate
|
||||
|
||||
|
||||
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
||||
|
|
15
spacy/kb/candidate.pxd
Normal file
15
spacy/kb/candidate.pxd
Normal file
|
@ -0,0 +1,15 @@
|
|||
from libcpp.vector cimport vector
|
||||
from .kb_in_memory cimport InMemoryLookupKB
|
||||
from ..typedefs cimport hash_t
|
||||
|
||||
cdef class Candidate:
|
||||
pass
|
||||
|
||||
|
||||
cdef class InMemoryCandidate(Candidate):
|
||||
cdef readonly hash_t _entity_hash
|
||||
cdef readonly hash_t _alias_hash
|
||||
cpdef vector[float] _entity_vector
|
||||
cdef float _prior_prob
|
||||
cdef readonly InMemoryLookupKB _kb
|
||||
cdef float _entity_freq
|
|
@ -1,118 +0,0 @@
|
|||
import abc
|
||||
from typing import List, Callable
|
||||
|
||||
|
||||
class Candidate(abc.ABC):
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity_id) pair is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mention: str,
|
||||
entity_id: int,
|
||||
entity_name: str,
|
||||
entity_vector: List[float],
|
||||
prior_prob: float,
|
||||
):
|
||||
"""Initializes properties of `Candidate` instance.
|
||||
mention (str): Mention text for this candidate.
|
||||
entity_id (int): Unique entity ID.
|
||||
entity_name (str): Entity name.
|
||||
entity_vector (List[float]): Entity embedding.
|
||||
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
|
||||
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
|
||||
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
|
||||
doesn't) it might be better to eschew this information and always supply the same value.
|
||||
"""
|
||||
self._mention = mention
|
||||
self._entity_id = entity_id
|
||||
self._entity_name = entity_name
|
||||
self._entity_vector = entity_vector
|
||||
self._prior_prob = prior_prob
|
||||
|
||||
@property
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (int): Unique entity ID."""
|
||||
return self._entity_id
|
||||
|
||||
@property
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (int): Entity name."""
|
||||
return self._entity_name
|
||||
|
||||
@property
|
||||
def mention(self) -> str:
|
||||
"""RETURNS (str): Mention."""
|
||||
return self._mention
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> List[float]:
|
||||
"""RETURNS (List[float]): Entity vector."""
|
||||
return self._entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
"""RETURNS (List[float]): Entity vector."""
|
||||
return self._prior_prob
|
||||
|
||||
|
||||
class InMemoryCandidate(Candidate):
|
||||
"""Candidate for InMemoryLookupKB."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
retrieve_string_from_hash: Callable[[int], str],
|
||||
entity_hash: int,
|
||||
entity_freq: int,
|
||||
entity_vector: List[float],
|
||||
alias_hash: int,
|
||||
prior_prob: float,
|
||||
):
|
||||
"""
|
||||
retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
|
||||
hash.
|
||||
entity_hash (str): Hashed entity name /ID.
|
||||
entity_freq (int): Entity frequency in KB corpus.
|
||||
entity_vector (List[float]): Entity embedding.
|
||||
alias_hash (int): Hashed alias.
|
||||
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
|
||||
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
|
||||
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
|
||||
doesn't) it might be better to eschew this information and always supply the same value.
|
||||
"""
|
||||
super().__init__(
|
||||
mention=retrieve_string_from_hash(alias_hash),
|
||||
entity_id=entity_hash,
|
||||
entity_name=retrieve_string_from_hash(entity_hash),
|
||||
entity_vector=entity_vector,
|
||||
prior_prob=prior_prob,
|
||||
)
|
||||
self._retrieve_string_from_hash = retrieve_string_from_hash
|
||||
self._entity_hash = entity_hash
|
||||
self._entity_freq = entity_freq
|
||||
self._alias_hash = alias_hash
|
||||
self._prior_prob = prior_prob
|
||||
|
||||
@property
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (int): hash of the entity_id's KB ID/name"""
|
||||
return self._entity_hash
|
||||
|
||||
@property
|
||||
def alias(self) -> int:
|
||||
"""RETURNS (int): hash of the alias"""
|
||||
return self._alias_hash
|
||||
|
||||
@property
|
||||
def alias_(self) -> str:
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self._retrieve_string_from_hash(self._alias_hash)
|
||||
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
return self._entity_freq
|
96
spacy/kb/candidate.pyx
Normal file
96
spacy/kb/candidate.pyx
Normal file
|
@ -0,0 +1,96 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
|
||||
from .kb_in_memory cimport InMemoryLookupKB
|
||||
from ..errors import Errors
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention that may or may not be resolved
|
||||
to a specific entity from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
|
||||
is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# Make sure abstract Candidate is not instantiated.
|
||||
if self.__class__ == Candidate:
|
||||
raise TypeError(
|
||||
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||
)
|
||||
|
||||
@property
|
||||
def entity_id(self) -> int:
|
||||
"""RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
|
||||
otherwise the hash of the entity ID string)."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def entity_id_(self) -> str:
|
||||
"""RETURNS (str): String representation of entity ID."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> vector[float]:
|
||||
"""RETURNS (vector[float]): Entity vector."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
cdef class InMemoryCandidate(Candidate):
|
||||
"""Candidate for InMemoryLookupKB."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kb: InMemoryLookupKB,
|
||||
entity_hash: int,
|
||||
alias_hash: int,
|
||||
entity_vector: vector[float],
|
||||
prior_prob: float,
|
||||
entity_freq: float
|
||||
):
|
||||
"""
|
||||
kb (InMemoryLookupKB): InMemoryLookupKB instance.
|
||||
entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
|
||||
entity_freq (float): Entity frequency in KB corpus.
|
||||
entity_vector (List[float]): Entity embedding.
|
||||
alias_hash (int): Alias hash.
|
||||
prior_prob (float): Prior probability of entity for this alias, i.e. the probability that, independent of
|
||||
the context, this alias - which matches one of this entity's aliases - resolves to this entity.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self._entity_hash = entity_hash
|
||||
self._entity_vector = entity_vector
|
||||
self._prior_prob = prior_prob
|
||||
self._kb = kb
|
||||
self._alias_hash = alias_hash
|
||||
self._entity_freq = entity_freq
|
||||
|
||||
@property
|
||||
def entity_id(self) -> int:
|
||||
return self._entity_hash
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> vector[float]:
|
||||
return self._entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
"""RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
|
||||
this entity."""
|
||||
return self._prior_prob
|
||||
|
||||
@property
|
||||
def alias(self) -> str:
|
||||
"""RETURNS (str): Alias."""
|
||||
return self._kb.vocab.strings[self._alias_hash]
|
||||
|
||||
@property
|
||||
def entity_id_(self) -> str:
|
||||
return self._kb.vocab.strings[self._entity_hash]
|
||||
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
"""RETURNS (float): Entity frequency in KB corpus."""
|
||||
return self._entity_freq
|
|
@ -32,11 +32,12 @@ cdef class KnowledgeBase:
|
|||
|
||||
def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[Candidate]]]:
|
||||
"""
|
||||
Return candidate entities for mentions stored in `ent` attribute in passed docs. Each candidate defines the
|
||||
entity, the original alias, and the prior probability of that alias resolving to that entity.
|
||||
If no candidate is found for a given mention, an empty list is returned.
|
||||
mentions (Iterator[SpanGroup]): Mentions per doc as SpanGroup instance.
|
||||
RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document.
|
||||
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
||||
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||
probability of the specified mention text resolving to that entity - might be included.
|
||||
If no candidates are found for a given mention, an empty list is returned.
|
||||
mentions (Iterable[SpanGroup]): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
||||
|
@ -96,3 +97,10 @@ cdef class KnowledgeBase:
|
|||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||
)
|
||||
|
||||
@property
|
||||
def supports_prior_probs(self) -> bool:
|
||||
"""RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
|
||||
)
|
||||
|
|
|
@ -230,7 +230,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
for mentions_for_doc in mentions:
|
||||
yield [self.get_alias_candidates(ent_span.text) for ent_span in mentions_for_doc]
|
||||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
||||
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
|
@ -244,12 +244,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
return [
|
||||
InMemoryCandidate(
|
||||
retrieve_string_from_hash=self.vocab.strings.__getitem__,
|
||||
kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
alias_hash=alias_hash,
|
||||
prior_prob=prior_prob
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
prior_prob=prior_prob,
|
||||
entity_freq=self._entries[entry_index].freq
|
||||
)
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
if entry_index != 0
|
||||
|
@ -284,6 +284,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
return 0.0
|
||||
|
||||
@property
def supports_prior_probs(self) -> bool:
    """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.

    Declared as a property to match `KnowledgeBase.supports_prior_probs`, which callers read as an attribute
    (e.g. `if self.incl_prior and not self.kb.supports_prior_probs`). As a plain method, the bound method object
    would always be truthy in such a check, regardless of the value it returns.
    """
    # The in-memory KB stores a prior probability per (alias, entity) pair, so lookups are always available.
    return True
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
"""
|
||||
|
|
|
@ -108,7 +108,7 @@ def empty_kb(
|
|||
|
||||
|
||||
@registry.misc("spacy.CandidateGenerator.v1")
|
||||
def create_candidates_all() -> Callable[
|
||||
def create_get_candidates() -> Callable[
|
||||
[KnowledgeBase, Iterator[SpanGroup]],
|
||||
Iterator[Iterable[Iterable[Candidate]]],
|
||||
]:
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
import warnings
|
||||
from typing import (
|
||||
cast,
|
||||
Optional,
|
||||
Iterable,
|
||||
Callable,
|
||||
|
@ -9,7 +11,6 @@ from typing import (
|
|||
Any,
|
||||
Iterator,
|
||||
)
|
||||
from typing import cast
|
||||
from numpy import dtype
|
||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||
from pathlib import Path
|
||||
|
@ -27,7 +28,7 @@ from .trainable_pipe import TrainablePipe
|
|||
from ..language import Language
|
||||
from ..vocab import Vocab
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..errors import Errors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
@ -120,28 +121,9 @@ def make_entity_linker(
|
|||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
try:
|
||||
from spacy_legacy.components.entity_linker import EntityLinker_v1
|
||||
except:
|
||||
raise ImportError(
|
||||
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
|
||||
)
|
||||
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
|
||||
return EntityLinker_v1(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
labels_discard=labels_discard,
|
||||
n_sents=n_sents,
|
||||
incl_prior=incl_prior,
|
||||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
)
|
||||
raise ValueError(Errors.E4005)
|
||||
|
||||
return EntityLinker(
|
||||
nlp.vocab,
|
||||
model,
|
||||
|
@ -251,6 +233,9 @@ class EntityLinker(TrainablePipe):
|
|||
self.threshold = threshold
|
||||
self.save_activations = save_activations
|
||||
|
||||
if self.incl_prior and not self.kb.supports_prior_probs:
|
||||
warnings.warn(Warnings.W401)
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
create it using this object's vocab."""
|
||||
|
|
|
@ -7,7 +7,7 @@ from thinc.types import Ragged
|
|||
from spacy import registry, util
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
from spacy.compat import pickle
|
||||
from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml import load_kb
|
||||
from spacy.ml.models.entity_linker import build_span_maker, get_candidates
|
||||
|
@ -479,8 +479,8 @@ def test_candidate_generation(nlp):
|
|||
)
|
||||
|
||||
# test the content of the candidates
|
||||
assert adam_ent_cands[0].entity_ == "Q2"
|
||||
assert adam_ent_cands[0].alias_ == "adam"
|
||||
assert adam_ent_cands[0].entity_id_ == "Q2"
|
||||
assert adam_ent_cands[0].alias == "adam"
|
||||
assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
|
||||
assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)
|
||||
|
||||
|
@ -519,7 +519,7 @@ def test_el_pipe_configuration(nlp):
|
|||
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
||||
def create_candidates() -> Callable[
|
||||
[InMemoryLookupKB, Iterator[SpanGroup]],
|
||||
Iterator[Iterable[Iterable[InMemoryCandidate]]],
|
||||
Iterator[Iterable[Iterable[Candidate]]],
|
||||
]:
|
||||
return get_lowercased_candidates
|
||||
|
||||
|
@ -562,24 +562,22 @@ def test_vocab_serialization(nlp):
|
|||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
candidates = mykb.get_alias_candidates("adam")
|
||||
candidates = mykb._get_alias_candidates("adam")
|
||||
assert len(candidates) == 1
|
||||
assert candidates[0].entity == q2_hash
|
||||
assert candidates[0].entity_ == "Q2"
|
||||
assert candidates[0].alias == adam_hash
|
||||
assert candidates[0].alias_ == "adam"
|
||||
assert candidates[0].entity_id == q2_hash
|
||||
assert candidates[0].entity_id_ == "Q2"
|
||||
assert candidates[0].alias == "adam"
|
||||
|
||||
with make_tempdir() as d:
|
||||
mykb.to_disk(d / "kb")
|
||||
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
|
||||
kb_new_vocab.from_disk(d / "kb")
|
||||
|
||||
candidates = kb_new_vocab.get_alias_candidates("adam")
|
||||
candidates = kb_new_vocab._get_alias_candidates("adam")
|
||||
assert len(candidates) == 1
|
||||
assert candidates[0].entity == q2_hash
|
||||
assert candidates[0].entity_ == "Q2"
|
||||
assert candidates[0].alias == adam_hash
|
||||
assert candidates[0].alias_ == "adam"
|
||||
assert candidates[0].entity_id == q2_hash
|
||||
assert candidates[0].entity_id_ == "Q2"
|
||||
assert candidates[0].alias == "adam"
|
||||
|
||||
assert kb_new_vocab.get_vector("Q2") == [2]
|
||||
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
|
||||
|
@ -599,20 +597,20 @@ def test_append_alias(nlp):
|
|||
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
# test the size of the relevant candidates
|
||||
assert len(mykb.get_alias_candidates("douglas")) == 2
|
||||
assert len(mykb._get_alias_candidates("douglas")) == 2
|
||||
|
||||
# append an alias
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
|
||||
|
||||
# test the size of the relevant candidates has been incremented
|
||||
assert len(mykb.get_alias_candidates("douglas")) == 3
|
||||
assert len(mykb._get_alias_candidates("douglas")) == 3
|
||||
|
||||
# append the same alias-entity pair again should not work (will throw a warning)
|
||||
with pytest.warns(UserWarning):
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
|
||||
|
||||
# test the size of the relevant candidates remained unchanged
|
||||
assert len(mykb.get_alias_candidates("douglas")) == 3
|
||||
assert len(mykb._get_alias_candidates("douglas")) == 3
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:\\[W036")
|
||||
|
@ -909,11 +907,11 @@ def test_kb_to_bytes():
|
|||
assert kb_2.contains_alias("Russ Cochran")
|
||||
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
|
||||
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
|
||||
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
|
||||
kb_2.get_alias_candidates("Russ Cochran")
|
||||
assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
|
||||
kb_2._get_alias_candidates("Russ Cochran")
|
||||
)
|
||||
assert len(kb_1.get_alias_candidates("Randomness")) == len(
|
||||
kb_2.get_alias_candidates("Randomness")
|
||||
assert len(kb_1._get_alias_candidates("Randomness")) == len(
|
||||
kb_2._get_alias_candidates("Randomness")
|
||||
)
|
||||
|
||||
|
||||
|
@ -994,14 +992,11 @@ def test_scorer_links():
|
|||
@pytest.mark.parametrize(
|
||||
"name,config",
|
||||
[
|
||||
("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
|
||||
("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
def test_legacy_architectures(name, config):
|
||||
from spacy_legacy.components.entity_linker import EntityLinker_v1
|
||||
|
||||
# Ensure that the legacy architectures still work
|
||||
vector_length = 3
|
||||
nlp = English()
|
||||
|
@ -1023,10 +1018,7 @@ def test_legacy_architectures(name, config):
|
|||
return mykb
|
||||
|
||||
entity_linker = nlp.add_pipe(name, config={"model": config})
|
||||
if config["@architectures"] == "spacy.EntityLinker.v1":
|
||||
assert isinstance(entity_linker, EntityLinker_v1)
|
||||
else:
|
||||
assert isinstance(entity_linker, EntityLinker)
|
||||
assert isinstance(entity_linker, EntityLinker)
|
||||
entity_linker.set_kb(create_kb)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
|
|
@ -66,19 +66,21 @@ def _check_kb(kb):
|
|||
assert alias_string not in kb.get_alias_strings()
|
||||
|
||||
# check candidates & probabilities
|
||||
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
|
||||
candidates = sorted(
|
||||
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
|
||||
)
|
||||
assert len(candidates) == 2
|
||||
|
||||
assert candidates[0].entity_ == "Q007"
|
||||
assert candidates[0].entity_id_ == "Q007"
|
||||
assert 6.999 < candidates[0].entity_freq < 7.01
|
||||
assert candidates[0].entity_vector == [0, 0, 7]
|
||||
assert candidates[0].alias_ == "double07"
|
||||
assert candidates[0].alias == "double07"
|
||||
assert 0.899 < candidates[0].prior_prob < 0.901
|
||||
|
||||
assert candidates[1].entity_ == "Q17"
|
||||
assert candidates[1].entity_id_ == "Q17"
|
||||
assert 1.99 < candidates[1].entity_freq < 2.01
|
||||
assert candidates[1].entity_vector == [7, 1, 0]
|
||||
assert candidates[1].alias_ == "double07"
|
||||
assert candidates[1].alias == "double07"
|
||||
assert 0.099 < candidates[1].prior_prob < 0.101
|
||||
|
||||
|
||||
|
|
|
@ -178,22 +178,6 @@ implementation of `KnowledgeBase.get_candidates()`.
|
|||
| `mentions` | The textual mention or alias. ~~Iterable[SpanGroup]~~ |
|
||||
| **RETURNS** | An iterator over iterables of iterables with relevant [`InMemoryCandidate`](/api/kb#candidate) objects (per mention and doc). ~~Iterator[Iterable[Iterable[InMemoryCandidate]]]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`InMemoryCandidate`](/api/kb#candidate).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> candidates = kb.get_alias_candidates("Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------- |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
||||
|
||||
Given a certain entity ID, retrieve its pretrained entity vector.
|
||||
|
|
|
@ -155,15 +155,15 @@ Restore the state of the knowledge base from a given directory. Note that the
|
|||
|
||||
## InMemoryCandidate {id="candidate",tag="class"}
|
||||
|
||||
A `InMemoryCandidate` object refers to a textual mention that may or may not be
|
||||
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
|
||||
for the entity linking algorithm which will disambiguate the various candidates
|
||||
to the correct one. Each candidate `(mention, entity)` pair is assigned to a
|
||||
certain prior probability.
|
||||
An `InMemoryCandidate` object refers to a textual mention (alias) that may or
|
||||
may not be resolved to a specific entity from a `KnowledgeBase`. This will be
|
||||
used as input for the entity linking algorithm which will disambiguate the
|
||||
various candidates to the correct one. Each candidate `(alias, entity)` pair is
|
||||
assigned a certain prior probability.
|
||||
|
||||
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
||||
|
||||
Construct a `InMemoryCandidate` object. Usually this constructor is not called
|
||||
Construct an `InMemoryCandidate` object. Usually this constructor is not called
|
||||
directly, but instead these objects are returned by the `get_candidates` method
|
||||
of the [`entity_linker`](/api/entitylinker) pipe.
|
||||
|
||||
|
@ -181,7 +181,7 @@ of the [`entity_linker`](/api/entitylinker) pipe.
|
|||
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
|
||||
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
|
||||
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
|
||||
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
|
||||
| `alias_hash` | The hash of the entity alias. ~~int~~ |
|
||||
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||
|
||||
## InMemoryCandidate attributes {id="candidate-attributes"}
|
||||
|
|
Loading…
Reference in New Issue
Block a user