Merge branch 'refactor/el-candidates' into refactor/span-group-for-mentions

# Conflicts:
#	spacy/ml/models/entity_linker.py
#	website/docs/api/inmemorylookupkb.mdx
This commit is contained in:
Raphael Mitsch 2023-03-03 08:32:38 +01:00
commit 3beda2b23a
7 changed files with 76 additions and 65 deletions

View File

@ -1,5 +1,5 @@
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate
from .candidate import Candidate, InMemoryCandidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate"]
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

View File

@ -2,8 +2,8 @@ import abc
from typing import List, Union, Callable
class BaseCandidate(abc.ABC):
"""A `BaseCandidate` object refers to a textual mention (`alias`) that may or may not be resolved
class Candidate(abc.ABC):
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity_id) pair is assigned a certain prior probability.
@ -12,26 +12,38 @@ class BaseCandidate(abc.ABC):
"""
def __init__(
self, mention: str, entity_id: Union[int, str], entity_vector: List[float]
self,
mention: str,
entity_id: int,
entity_name: str,
entity_vector: List[float],
prior_prob: float,
):
"""Create new instance of `Candidate`. Note: has to be a sub-class, otherwise error will be raised.
"""Initializes properties of `Candidate` instance.
mention (str): Mention text for this candidate.
entity_id (Union[int, str]): Unique entity ID.
entity_id (int): Unique entity ID.
entity_name (str): Entity name.
entity_vector (List[float]): Entity embedding.
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
doesn't) it might be better to eschew this information and always supply the same value.
"""
self._mention = mention
self._entity_id = entity_id
self._entity_name = entity_name
self._entity_vector = entity_vector
self._prior_prob = prior_prob
@property
def entity(self) -> Union[int, str]:
"""RETURNS (Union[int, str]): Entity ID."""
def entity(self) -> int:
"""RETURNS (int): Unique entity ID."""
return self._entity_id
@property
@abc.abstractmethod
def entity_(self) -> str:
"""RETURNS (str): Entity name."""
"""RETURNS (str): Entity name."""
return self._entity_name
@property
def mention(self) -> str:
@ -43,9 +55,14 @@ class BaseCandidate(abc.ABC):
"""RETURNS (List[float]): Entity vector."""
return self._entity_vector
@property
def prior_prob(self) -> float:
"""RETURNS (float): Prior probability of the entity for this mention."""
return self._prior_prob
class Candidate(BaseCandidate):
"""`Candidate` for InMemoryLookupKBCandidate."""
class InMemoryCandidate(Candidate):
"""Candidate for InMemoryLookupKB."""
def __init__(
self,
@ -57,7 +74,7 @@ class Candidate(BaseCandidate):
prior_prob: float,
):
"""
retrieve_string_from_hash (Callable[[int], str]): Callable retrieveing entity name from provided entity/vocab
retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
hash.
entity_hash (int): Hashed entity name/ID.
entity_freq (int): Entity frequency in KB corpus.
@ -71,7 +88,9 @@ class Candidate(BaseCandidate):
super().__init__(
mention=retrieve_string_from_hash(alias_hash),
entity_id=entity_hash,
entity_name=retrieve_string_from_hash(entity_hash),
entity_vector=entity_vector,
prior_prob=prior_prob,
)
self._retrieve_string_from_hash = retrieve_string_from_hash
self._entity_hash = entity_hash
@ -84,11 +103,6 @@ class Candidate(BaseCandidate):
"""RETURNS (int): hash of the entity_id's KB ID/name"""
return self._entity_hash
@property
def entity_(self) -> str:
"""RETURNS (str): ID/name of this entity_id in the KB"""
return self._retrieve_string_from_hash(self._entity_hash)
@property
def alias(self) -> int:
"""RETURNS (int): hash of the alias"""
@ -102,8 +116,3 @@ class Candidate(BaseCandidate):
@property
def entity_freq(self) -> float:
return self._entity_freq
@property
def prior_prob(self) -> float:
"""RETURNS (List[float]): Entity vector."""
return self._prior_prob

View File

@ -18,7 +18,7 @@ from .. import util
from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate
from .candidate import InMemoryCandidate
cdef class InMemoryLookupKB(KnowledgeBase):
@ -223,10 +223,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@ -239,7 +239,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry = self._aliases_table[alias_index]
return [
Candidate(
InMemoryCandidate(
retrieve_string_from_hash=self.vocab.strings.__getitem__,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,

View File

@ -116,7 +116,7 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
Return candidate entities for a given mention and fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
RETURNS (Iterable[InMemoryCandidate]): Identified candidates.
"""
return kb.get_candidates(mention)
@ -128,6 +128,6 @@ def get_candidates_batch(
Return candidate entities for the given mentions and fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (SpanGroup): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
RETURNS (Iterable[Iterable[InMemoryCandidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)

View File

@ -7,7 +7,7 @@ from thinc.types import Ragged
from spacy import registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker, get_candidates
@ -506,13 +506,13 @@ def test_el_pipe_configuration(nlp):
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
def create_candidates() -> Callable[
[InMemoryLookupKB, "Span"], Iterable[Candidate]
[InMemoryLookupKB, "Span"], Iterable[InMemoryCandidate]
]:
return get_lowercased_candidates
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[InMemoryCandidate]]
]:
return get_lowercased_candidates_batch

View File

@ -10,9 +10,9 @@ version: 3.5
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
implements all of its methods. It stores all KB data in-memory and generates
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
entity names. It's highly optimized for both a low memory footprint and speed of
retrieval.
[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
with entity names. It's highly optimized for both a low memory footprint and
speed of retrieval.
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). Wraps
of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
> #### Example
@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
> candidates = kb.get_candidates(doc[0:2])
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------- |
| `mention` | The textual mention or alias. ~~Span~~ |
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------ |
| `mention` | The textual mention or alias. ~~Span~~ |
| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
@ -195,15 +195,15 @@ to you.
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------- |
| `mentions` | The textual mention or alias. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------ |
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate).
of type [`InMemoryCandidate`](/api/kb#candidate).
> #### Example
>
@ -211,10 +211,10 @@ of type [`Candidate`](/api/kb#candidate).
> candidates = kb.get_alias_candidates("Douglas")
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
| Name | Description |
| ----------- | ----------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~ |
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}

View File

@ -191,25 +191,27 @@ Restore the state of the knowledge base from a given directory. Note that the
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
## Candidate {id="candidate",tag="class"}
## InMemoryCandidate {id="candidate",tag="class"}
A `Candidate` object refers to a textual mention (alias) that may or may not be
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
for the entity linking algorithm which will disambiguate the various candidates
to the correct one. Each candidate `(alias, entity)` pair is assigned to a
certain prior probability.
An `InMemoryCandidate` object refers to a textual mention (alias) that may or
may not be resolved to a specific entity from a `KnowledgeBase`. This will be
used as input for the entity linking algorithm which will disambiguate the
various candidates to the correct one. Each candidate `(alias, entity)` pair is
assigned a certain prior probability.
### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly,
but instead these objects are returned by the `get_candidates` method of the
[`entity_linker`](/api/entitylinker) pipe.
Construct an `InMemoryCandidate` object. Usually this constructor is not called
directly, but instead these objects are returned by the `get_candidates` method
of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example
> #### Example
>
> ```python
> from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ```
>
> ```python
> from spacy.kb import Candidate
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ```
| Name | Description |
@ -220,7 +222,7 @@ but instead these objects are returned by the `get_candidates` method of the
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## Candidate attributes {id="candidate-attributes"}
## InMemoryCandidate attributes {id="candidate-attributes"}
| Name | Description |
| --------------- | ------------------------------------------------------------------------ |