Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-11 08:42:28 +03:00)

Merge branch 'refactor/el-candidates' into feature/docwise-generator-batching

# Conflicts:
#	spacy/kb/candidate.py
#	spacy/kb/kb.pyx
#	spacy/kb/kb_in_memory.pyx
#	spacy/ml/models/entity_linker.py
#	spacy/pipeline/entity_linker.py
#	spacy/tests/pipeline/test_entity_linker.py
#	website/docs/api/inmemorylookupkb.mdx
#	website/docs/api/kb.mdx

This commit is contained in: commit 73bdeb01e4
setup.py
@@ -30,6 +30,7 @@ MOD_NAMES = [
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
+    "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
     "spacy.ml.tb_framework",
spacy/errors.py
@@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "ignoring the duplicate entry.")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses. If "
             "you are constructing a parse tree incrementally by setting "
@@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")

     # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")


 class Errors(metaclass=ErrorsWithCodes):
@@ -960,6 +964,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4003 = ("Training examples for distillation must have the exact same tokens in the "
             "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
+    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")


 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
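As a quick illustration of how the codes added in this hunk surface at runtime (a hedged sketch, not part of the commit; it assumes the usual spaCy pattern of formatting message templates with `.format()` and emitting warnings via the stdlib `warnings` module):

```python
import warnings

from spacy.errors import Errors, Warnings

# E4006 is a template whose placeholders are filled via str.format():
message = Errors.E4006.format(exp_type="int", found_type="str")

# W401 is emitted when prior probabilities are requested from a KB that
# cannot supply them (see the entity_linker changes further down):
warnings.warn(Warnings.W401)
```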
spacy/kb/__init__.py
@@ -2,4 +2,5 @@ from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate


 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
spacy/kb/candidate.pxd (new file, 15 lines)
@@ -0,0 +1,15 @@
+from libcpp.vector cimport vector
+from .kb_in_memory cimport InMemoryLookupKB
+from ..typedefs cimport hash_t
+
+cdef class Candidate:
+    pass
+
+
+cdef class InMemoryCandidate(Candidate):
+    cdef readonly hash_t _entity_hash
+    cdef readonly hash_t _alias_hash
+    cpdef vector[float] _entity_vector
+    cdef float _prior_prob
+    cdef readonly InMemoryLookupKB _kb
+    cdef float _entity_freq
spacy/kb/candidate.py (deleted, 118 lines)
@@ -1,118 +0,0 @@
-import abc
-from typing import List, Callable
-
-
-class Candidate(abc.ABC):
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking
-    algorithm which will disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity_id) pair is assigned a certain prior probability.
-
-    DOCS: https://spacy.io/api/kb/#candidate-init
-    """
-
-    def __init__(
-        self,
-        mention: str,
-        entity_id: int,
-        entity_name: str,
-        entity_vector: List[float],
-        prior_prob: float,
-    ):
-        """Initializes properties of `Candidate` instance.
-        mention (str): Mention text for this candidate.
-        entity_id (int): Unique entity ID.
-        entity_name (str): Entity name.
-        entity_vector (List[float]): Entity embedding.
-        prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
-            the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
-            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
-            doesn't) it might be better to eschew this information and always supply the same value.
-        """
-        self._mention = mention
-        self._entity_id = entity_id
-        self._entity_name = entity_name
-        self._entity_vector = entity_vector
-        self._prior_prob = prior_prob
-
-    @property
-    def entity(self) -> int:
-        """RETURNS (int): Unique entity ID."""
-        return self._entity_id
-
-    @property
-    def entity_(self) -> str:
-        """RETURNS (int): Entity name."""
-        return self._entity_name
-
-    @property
-    def mention(self) -> str:
-        """RETURNS (str): Mention."""
-        return self._mention
-
-    @property
-    def entity_vector(self) -> List[float]:
-        """RETURNS (List[float]): Entity vector."""
-        return self._entity_vector
-
-    @property
-    def prior_prob(self) -> float:
-        """RETURNS (List[float]): Entity vector."""
-        return self._prior_prob
-
-
-class InMemoryCandidate(Candidate):
-    """Candidate for InMemoryLookupKB."""
-
-    def __init__(
-        self,
-        retrieve_string_from_hash: Callable[[int], str],
-        entity_hash: int,
-        entity_freq: int,
-        entity_vector: List[float],
-        alias_hash: int,
-        prior_prob: float,
-    ):
-        """
-        retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
-            hash.
-        entity_hash (str): Hashed entity name /ID.
-        entity_freq (int): Entity frequency in KB corpus.
-        entity_vector (List[float]): Entity embedding.
-        alias_hash (int): Hashed alias.
-        prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
-            the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
-            cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
-            doesn't) it might be better to eschew this information and always supply the same value.
-        """
-        super().__init__(
-            mention=retrieve_string_from_hash(alias_hash),
-            entity_id=entity_hash,
-            entity_name=retrieve_string_from_hash(entity_hash),
-            entity_vector=entity_vector,
-            prior_prob=prior_prob,
-        )
-        self._retrieve_string_from_hash = retrieve_string_from_hash
-        self._entity_hash = entity_hash
-        self._entity_freq = entity_freq
-        self._alias_hash = alias_hash
-        self._prior_prob = prior_prob
-
-    @property
-    def entity(self) -> int:
-        """RETURNS (int): hash of the entity_id's KB ID/name"""
-        return self._entity_hash
-
-    @property
-    def alias(self) -> int:
-        """RETURNS (int): hash of the alias"""
-        return self._alias_hash
-
-    @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self._retrieve_string_from_hash(self._alias_hash)
-
-    @property
-    def entity_freq(self) -> float:
-        return self._entity_freq
spacy/kb/candidate.pyx (new file, 96 lines)
@@ -0,0 +1,96 @@
+# cython: infer_types=True, profile=True
+
+from .kb_in_memory cimport InMemoryLookupKB
+from ..errors import Errors
+
+
+cdef class Candidate:
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
+    algorithm which will disambiguate the various candidates to the correct one.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.
+
+    DOCS: https://spacy.io/api/kb/#candidate-init
+    """
+
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
+
+    @property
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError
+
+    @property
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError
+
+    @property
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
+
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB]): InMemoryLookupKB instance.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq
+
+    @property
+    def entity_id(self) -> int:
+        return self._entity_hash
+
+    @property
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector
+
+    @property
+    def prior_prob(self) -> float:
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob
+
+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]
+
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]
+
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq
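A hedged usage sketch of the new `InMemoryCandidate` accessors, pieced together from the tests changed in this commit (`add_entity`, `add_alias` and the now-private `_get_alias_candidates` are taken from those tests; the exact surface may still shift while the branches are being merged):

```python
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(Vocab(), entity_vector_length=1)
q2_hash = kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

# The per-alias lookup is now internal; the tests in this diff call it directly.
cand = kb._get_alias_candidates("adam")[0]
assert cand.entity_id == q2_hash   # hash of the entity's KB ID
assert cand.entity_id_ == "Q2"     # string form, resolved via the KB's StringStore
assert cand.alias == "adam"        # the alias is now returned as a string
assert abs(cand.prior_prob - 0.9) < 1e-6
```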
spacy/kb/kb.pyx
@@ -32,11 +32,12 @@ cdef class KnowledgeBase:
     def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[Candidate]]]:
         """
-        Return candidate entities for mentions stored in `ent` attribute in passed docs. Each candidate defines the
-        entity, the original alias, and the prior probability of that alias resolving to that entity.
-        If no candidate is found for a given mention, an empty list is returned.
-        mentions (Iterator[SpanGroup]): Mentions per doc as SpanGroup instance.
-        RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document.
+        Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidates are found for a given mention, an empty list is returned.
+        mentions (Iterable[SpangGroup]): Mentions for which to get candidates.
+        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         raise NotImplementedError(
             Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
@@ -96,3 +97,10 @@ cdef class KnowledgeBase:
         raise NotImplementedError(
             Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
         )
+
+    @property
+    def supports_prior_probs(self) -> bool:
+        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
+        raise NotImplementedError(
+            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
+        )
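For custom KB implementations, the two methods touched here are the ones to override. A minimal sketch under stated assumptions (the class and `_lookup` helper below are hypothetical; `SpanGroup` and `Candidate` are imported from their public locations):

```python
from typing import Iterable, Iterator

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import SpanGroup


class MyCustomKB(KnowledgeBase):
    def get_candidates(
        self, mentions: Iterator[SpanGroup]
    ) -> Iterator[Iterable[Iterable[Candidate]]]:
        # One SpanGroup per doc comes in; one list of candidate lists per doc goes out.
        for mentions_for_doc in mentions:
            yield [self._lookup(span.text) for span in mentions_for_doc]

    @property
    def supports_prior_probs(self) -> bool:
        # This hypothetical KB keeps no alias statistics, so no priors; the
        # entity_linker will warn with W401 if incl_prior is enabled.
        return False

    def _lookup(self, mention_text: str) -> Iterable[Candidate]:
        # Hypothetical helper; a real implementation would query its own store.
        return []
```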
spacy/kb/kb_in_memory.pyx
@@ -230,7 +230,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         for mentions_for_doc in mentions:
             yield [self.get_alias_candidates(ent_span.text) for ent_span in mentions_for_doc]

-    def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -244,12 +244,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         return [
             InMemoryCandidate(
-                retrieve_string_from_hash=self.vocab.strings.__getitem__,
+                kb=self,
                 entity_hash=self._entries[entry_index].entity_hash,
-                entity_freq=self._entries[entry_index].freq,
-                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
                 alias_hash=alias_hash,
-                prior_prob=prior_prob
+                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                prior_prob=prior_prob,
+                entity_freq=self._entries[entry_index].freq
             )
             for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
             if entry_index != 0
@@ -284,6 +284,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         return 0.0

+    def supports_prior_probs(self) -> bool:
+        return True
+
     def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.
         """
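A hedged sketch of the doc-wise flow that the `yield` above implements: callers pass an iterator with one `SpanGroup` of mention spans per `Doc` and get back one list of candidate collections per `Doc`. Everything below uses APIs visible in this diff or its tests; while the merge conflict is unresolved, the internal alias-lookup call may still be renamed.

```python
import spacy
from spacy.kb import InMemoryLookupKB
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

doc = nlp("adam met adam")
mentions = SpanGroup(doc, spans=[doc[0:1], doc[2:3]])

# get_candidates() is a generator over docs; each item is a list with one
# iterable of candidates per mention span in that doc.
candidates_for_doc = next(kb.get_candidates(iter([mentions])))
for candidates_for_mention in candidates_for_doc:
    for cand in candidates_for_mention:
        print(cand.entity_id_, cand.prior_prob)
```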
spacy/ml/models/entity_linker.py
@@ -108,7 +108,7 @@ def empty_kb(


 @registry.misc("spacy.CandidateGenerator.v1")
-def create_candidates_all() -> Callable[
+def create_get_candidates() -> Callable[
     [KnowledgeBase, Iterator[SpanGroup]],
     Iterator[Iterable[Iterable[Candidate]]],
 ]:
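A hedged sketch of plugging a custom generator into this registry slot, modelled on the `spacy.LowercaseCandidateGenerator.v1` helper used in the tests of this commit (the registry name below is hypothetical; `_get_alias_candidates` is the internal InMemoryLookupKB lookup the tests rely on):

```python
from typing import Callable, Iterable, Iterator

from spacy import registry
from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import SpanGroup


@registry.misc("my_project.LowercaseCandidateGenerator.v1")
def create_lowercase_candidates() -> Callable[
    [InMemoryLookupKB, Iterator[SpanGroup]],
    Iterator[Iterable[Iterable[Candidate]]],
]:
    def get_lowercased_candidates(kb, mentions):
        for mentions_for_doc in mentions:
            # Normalize the surface form before looking the alias up.
            yield [
                kb._get_alias_candidates(span.text.lower())
                for span in mentions_for_doc
            ]

    return get_lowercased_candidates
```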
spacy/pipeline/entity_linker.py
@@ -1,4 +1,6 @@
+import warnings
 from typing import (
+    cast,
     Optional,
     Iterable,
     Callable,
@@ -9,7 +11,6 @@ from typing import (
     Any,
     Iterator,
 )
-from typing import cast
 from numpy import dtype
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from pathlib import Path
@@ -27,7 +28,7 @@ from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..scorer import Scorer
@@ -120,28 +121,9 @@ def make_entity_linker(
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
     """

     if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+        raise ValueError(Errors.E4005)
     return EntityLinker(
         nlp.vocab,
         model,
@@ -251,6 +233,9 @@ class EntityLinker(TrainablePipe):
         self.threshold = threshold
         self.save_activations = save_activations

+        if self.incl_prior and not self.kb.supports_prior_probs:
+            warnings.warn(Warnings.W401)
+
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
         create it using this object's vocab."""
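In practice the removed fallback means a config that resolves to the old `spacy.EntityLinker.v1` architecture now fails fast with E4005, while `incl_prior=True` on a KB without prior-probability support only triggers warning W401. A hedged sketch of the unchanged happy path, following the pipeline setup used in the tests:

```python
from spacy.kb import InMemoryLookupKB
from spacy.lang.en import English

nlp = English()
entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": True})


def create_kb(vocab):
    kb = InMemoryLookupKB(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q2", freq=12, entity_vector=[1, 2, 3])
    kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
    return kb


# InMemoryLookupKB reports support for prior probabilities, so no W401 here.
entity_linker.set_kb(create_kb)
```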
spacy/tests/pipeline/test_entity_linker.py
@@ -7,7 +7,7 @@ from thinc.types import Ragged
 from spacy import registry, util
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
-from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase
+from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
 from spacy.ml.models.entity_linker import build_span_maker, get_candidates
@@ -479,8 +479,8 @@ def test_candidate_generation(nlp):
     )

     # test the content of the candidates
-    assert adam_ent_cands[0].entity_ == "Q2"
-    assert adam_ent_cands[0].alias_ == "adam"
+    assert adam_ent_cands[0].entity_id_ == "Q2"
+    assert adam_ent_cands[0].alias == "adam"
     assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
     assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)

@@ -519,7 +519,7 @@ def test_el_pipe_configuration(nlp):
     @registry.misc("spacy.LowercaseCandidateGenerator.v1")
     def create_candidates() -> Callable[
         [InMemoryLookupKB, Iterator[SpanGroup]],
-        Iterator[Iterable[Iterable[InMemoryCandidate]]],
+        Iterator[Iterable[Iterable[Candidate]]],
     ]:
         return get_lowercased_candidates

@@ -562,24 +562,22 @@ def test_vocab_serialization(nlp):
     mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

-    candidates = mykb.get_alias_candidates("adam")
+    candidates = mykb._get_alias_candidates("adam")
     assert len(candidates) == 1
-    assert candidates[0].entity == q2_hash
-    assert candidates[0].entity_ == "Q2"
-    assert candidates[0].alias == adam_hash
-    assert candidates[0].alias_ == "adam"
+    assert candidates[0].entity_id == q2_hash
+    assert candidates[0].entity_id_ == "Q2"
+    assert candidates[0].alias == "adam"

     with make_tempdir() as d:
         mykb.to_disk(d / "kb")
         kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
         kb_new_vocab.from_disk(d / "kb")

-        candidates = kb_new_vocab.get_alias_candidates("adam")
+        candidates = kb_new_vocab._get_alias_candidates("adam")
         assert len(candidates) == 1
-        assert candidates[0].entity == q2_hash
-        assert candidates[0].entity_ == "Q2"
-        assert candidates[0].alias == adam_hash
-        assert candidates[0].alias_ == "adam"
+        assert candidates[0].entity_id == q2_hash
+        assert candidates[0].entity_id_ == "Q2"
+        assert candidates[0].alias == "adam"

         assert kb_new_vocab.get_vector("Q2") == [2]
         assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@@ -599,20 +597,20 @@ def test_append_alias(nlp):
     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

     # test the size of the relevant candidates
-    assert len(mykb.get_alias_candidates("douglas")) == 2
+    assert len(mykb._get_alias_candidates("douglas")) == 2

     # append an alias
     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

     # test the size of the relevant candidates has been incremented
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3

     # append the same alias-entity pair again should not work (will throw a warning)
     with pytest.warns(UserWarning):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

     # test the size of the relevant candidates remained unchanged
-    assert len(mykb.get_alias_candidates("douglas")) == 3
+    assert len(mykb._get_alias_candidates("douglas")) == 3


 @pytest.mark.filterwarnings("ignore:\\[W036")
@@ -909,11 +907,11 @@ def test_kb_to_bytes():
     assert kb_2.contains_alias("Russ Cochran")
     assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
     assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
-    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
-        kb_2.get_alias_candidates("Russ Cochran")
+    assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
+        kb_2._get_alias_candidates("Russ Cochran")
     )
-    assert len(kb_1.get_alias_candidates("Randomness")) == len(
-        kb_2.get_alias_candidates("Randomness")
+    assert len(kb_1._get_alias_candidates("Randomness")) == len(
+        kb_2._get_alias_candidates("Randomness")
     )


@@ -994,14 +992,11 @@ def test_scorer_links():
 @pytest.mark.parametrize(
     "name,config",
     [
-        ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
     ],
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    from spacy_legacy.components.entity_linker import EntityLinker_v1
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()
@@ -1023,10 +1018,7 @@ def test_legacy_architectures(name, config):
         return mykb

     entity_linker = nlp.add_pipe(name, config={"model": config})
-    if config["@architectures"] == "spacy.EntityLinker.v1":
-        assert isinstance(entity_linker, EntityLinker_v1)
-    else:
-        assert isinstance(entity_linker, EntityLinker)
+    assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
spacy/tests/serialize/test_serialize_kb.py
@@ -66,19 +66,21 @@ def _check_kb(kb):
     assert alias_string not in kb.get_alias_strings()

     # check candidates & probabilities
-    candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
+    candidates = sorted(
+        kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
+    )
     assert len(candidates) == 2

-    assert candidates[0].entity_ == "Q007"
+    assert candidates[0].entity_id_ == "Q007"
     assert 6.999 < candidates[0].entity_freq < 7.01
     assert candidates[0].entity_vector == [0, 0, 7]
-    assert candidates[0].alias_ == "double07"
+    assert candidates[0].alias == "double07"
     assert 0.899 < candidates[0].prior_prob < 0.901

-    assert candidates[1].entity_ == "Q17"
+    assert candidates[1].entity_id_ == "Q17"
     assert 1.99 < candidates[1].entity_freq < 2.01
     assert candidates[1].entity_vector == [7, 1, 0]
-    assert candidates[1].alias_ == "double07"
+    assert candidates[1].alias == "double07"
     assert 0.099 < candidates[1].prior_prob < 0.101
website/docs/api/inmemorylookupkb.mdx
@@ -178,22 +178,6 @@ implementation of `KnowledgeBase.get_candidates()`.
 | `mentions`  | The textual mention or alias. ~~Iterable[SpanGroup]~~ |
 | **RETURNS** | An iterator over iterables of iterables with relevant [`InMemoryCandidate`](/api/kb#candidate) objects (per mention and doc). ~~Iterator[Iterable[Iterable[InMemoryCandidate]]]~~ |

-## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`InMemoryCandidate`](/api/kb#candidate).
-
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name        | Description                                                                    |
-| ----------- | ------------------------------------------------------------------------------ |
-| `alias`     | The textual mention or alias. ~~str~~                                          |
-| **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~  |
-
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}

 Given a certain entity ID, retrieve its pretrained entity vector.
website/docs/api/kb.mdx
@@ -155,15 +155,15 @@ Restore the state of the knowledge base from a given directory. Note that the

 ## InMemoryCandidate {id="candidate",tag="class"}

-A `InMemoryCandidate` object refers to a textual mention that may or may not be
-resolved to a specific entity from a `KnowledgeBase`. This will be used as input
-for the entity linking algorithm which will disambiguate the various candidates
-to the correct one. Each candidate `(mention, entity)` pair is assigned to a
-certain prior probability.
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or
+may not be resolved to a specific entity from a `KnowledgeBase`. This will be
+used as input for the entity linking algorithm which will disambiguate the
+various candidates to the correct one. Each candidate `(alias, entity)` pair is
+assigned to a certain prior probability.

 ### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}

-Construct a `InMemoryCandidate` object. Usually this constructor is not called
+Construct an `InMemoryCandidate` object. Usually this constructor is not called
 directly, but instead these objects are returned by the `get_candidates` method
 of the [`entity_linker`](/api/entitylinker) pipe.

@@ -181,7 +181,7 @@ of the [`entity_linker`](/api/entitylinker) pipe.
 | `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~         |
 | `entity_hash` | The hash of the entity's KB ID. ~~int~~                                   |
 | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~                     |
-| `alias_hash`  | The hash of the textual mention or alias. ~~int~~                         |
+| `alias_hash`  | The hash of the entity alias. ~~int~~                                     |
 | `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |

 ## InMemoryCandidate attributes {id="candidate-attributes"}