Merge branch 'refactor/el-candidates' into refactor/span-group-for-mentions

# Conflicts:
#	spacy/ml/models/entity_linker.py
#	website/docs/api/inmemorylookupkb.mdx
This commit is contained in:
Raphael Mitsch 2023-03-03 08:32:38 +01:00
commit 3beda2b23a
7 changed files with 76 additions and 65 deletions

View File

@ -1,5 +1,5 @@
from .kb import KnowledgeBase from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate from .candidate import Candidate, InMemoryCandidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate"] __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

View File

@ -2,8 +2,8 @@ import abc
from typing import List, Union, Callable from typing import List, Union, Callable
class BaseCandidate(abc.ABC): class Candidate(abc.ABC):
"""A `BaseCandidate` object refers to a textual mention (`alias`) that may or may not be resolved """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity linking to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one. algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity_id) pair is assigned a certain prior probability. Each candidate (alias, entity_id) pair is assigned a certain prior probability.
@ -12,26 +12,38 @@ class BaseCandidate(abc.ABC):
""" """
def __init__( def __init__(
self, mention: str, entity_id: Union[int, str], entity_vector: List[float] self,
mention: str,
entity_id: int,
entity_name: str,
entity_vector: List[float],
prior_prob: float,
): ):
"""Create new instance of `Candidate`. Note: has to be a sub-class, otherwise error will be raised. """Initializes properties of `Candidate` instance.
mention (str): Mention text for this candidate. mention (str): Mention text for this candidate.
entity_id (Union[int, str]): Unique entity ID. entity_id (int): Unique entity ID.
entity_name (str): Entity name.
entity_vector (List[float]): Entity embedding. entity_vector (List[float]): Entity embedding.
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
doesn't) it might be better to eschew this information and always supply the same value.
""" """
self._mention = mention self._mention = mention
self._entity_id = entity_id self._entity_id = entity_id
self._entity_name = entity_name
self._entity_vector = entity_vector self._entity_vector = entity_vector
self._prior_prob = prior_prob
@property @property
def entity(self) -> Union[int, str]: def entity(self) -> int:
"""RETURNS (Union[int, str]): Entity ID.""" """RETURNS (int): Unique entity ID."""
return self._entity_id return self._entity_id
@property @property
@abc.abstractmethod
def entity_(self) -> str: def entity_(self) -> str:
"""RETURNS (str): Entity name.""" """RETURNS (str): Entity name."""
return self._entity_name
@property @property
def mention(self) -> str: def mention(self) -> str:
@ -43,9 +55,14 @@ class BaseCandidate(abc.ABC):
"""RETURNS (List[float]): Entity vector.""" """RETURNS (List[float]): Entity vector."""
return self._entity_vector return self._entity_vector
@property
def prior_prob(self) -> float:
"""RETURNS (float): Prior probability of the entity for this mention."""
return self._prior_prob
class Candidate(BaseCandidate):
"""`Candidate` for InMemoryLookupKBCandidate.""" class InMemoryCandidate(Candidate):
"""Candidate for InMemoryLookupKB."""
def __init__( def __init__(
self, self,
@ -57,7 +74,7 @@ class Candidate(BaseCandidate):
prior_prob: float, prior_prob: float,
): ):
""" """
retrieve_string_from_hash (Callable[[int], str]): Callable retrieveing entity name from provided entity/vocab retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
hash. hash.
entity_hash (int): Hashed entity name/ID. entity_hash (int): Hashed entity name/ID.
entity_freq (int): Entity frequency in KB corpus. entity_freq (int): Entity frequency in KB corpus.
@ -71,7 +88,9 @@ class Candidate(BaseCandidate):
super().__init__( super().__init__(
mention=retrieve_string_from_hash(alias_hash), mention=retrieve_string_from_hash(alias_hash),
entity_id=entity_hash, entity_id=entity_hash,
entity_name=retrieve_string_from_hash(entity_hash),
entity_vector=entity_vector, entity_vector=entity_vector,
prior_prob=prior_prob,
) )
self._retrieve_string_from_hash = retrieve_string_from_hash self._retrieve_string_from_hash = retrieve_string_from_hash
self._entity_hash = entity_hash self._entity_hash = entity_hash
@ -84,11 +103,6 @@ class Candidate(BaseCandidate):
"""RETURNS (int): hash of the entity_id's KB ID/name""" """RETURNS (int): hash of the entity_id's KB ID/name"""
return self._entity_hash return self._entity_hash
@property
def entity_(self) -> str:
"""RETURNS (str): ID/name of this entity_id in the KB"""
return self._retrieve_string_from_hash(self._entity_hash)
@property @property
def alias(self) -> int: def alias(self) -> int:
"""RETURNS (int): hash of the alias""" """RETURNS (int): hash of the alias"""
@ -102,8 +116,3 @@ class Candidate(BaseCandidate):
@property @property
def entity_freq(self) -> float: def entity_freq(self) -> float:
return self._entity_freq return self._entity_freq
@property
def prior_prob(self) -> float:
"""RETURNS (List[float]): Entity vector."""
return self._prior_prob

View File

@ -18,7 +18,7 @@ from .. import util
from ..util import SimpleFrozenList, ensure_path from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate from .candidate import InMemoryCandidate
cdef class InMemoryLookupKB(KnowledgeBase): cdef class InMemoryLookupKB(KnowledgeBase):
@ -223,10 +223,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry.probs = probs alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore return self.get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]: def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
""" """
Return candidate entities for an alias. Each candidate defines the entity, the original alias, Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity. and the prior probability of that alias resolving to that entity.
@ -239,7 +239,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [ return [
Candidate( InMemoryCandidate(
retrieve_string_from_hash=self.vocab.strings.__getitem__, retrieve_string_from_hash=self.vocab.strings.__getitem__,
entity_hash=self._entries[entry_index].entity_hash, entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq, entity_freq=self._entries[entry_index].freq,

View File

@ -116,7 +116,7 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
Return candidate entities for a given mention and fetching appropriate entries from the index. Return candidate entities for a given mention and fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query. kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates. mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates. RETURNS (Iterable[InMemoryCandidate]): Identified candidates.
""" """
return kb.get_candidates(mention) return kb.get_candidates(mention)
@ -128,6 +128,6 @@ def get_candidates_batch(
Return candidate entities for the given mentions and fetching appropriate entries from the index. Return candidate entities for the given mentions and fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query. kb (KnowledgeBase): Knowledge base to query.
mentions (SpanGroup): Entity mentions for which to identify candidates. mentions (SpanGroup): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. RETURNS (Iterable[Iterable[InMemoryCandidate]]): Identified candidates.
""" """
return kb.get_candidates_batch(mentions) return kb.get_candidates_batch(mentions)

View File

@ -7,7 +7,7 @@ from thinc.types import Ragged
from spacy import registry, util from spacy import registry, util
from spacy.attrs import ENT_KB_ID from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase
from spacy.lang.en import English from spacy.lang.en import English
from spacy.ml import load_kb from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.ml.models.entity_linker import build_span_maker, get_candidates
@ -506,13 +506,13 @@ def test_el_pipe_configuration(nlp):
@registry.misc("spacy.LowercaseCandidateGenerator.v1") @registry.misc("spacy.LowercaseCandidateGenerator.v1")
def create_candidates() -> Callable[ def create_candidates() -> Callable[
[InMemoryLookupKB, "Span"], Iterable[Candidate] [InMemoryLookupKB, "Span"], Iterable[InMemoryCandidate]
]: ]:
return get_lowercased_candidates return get_lowercased_candidates
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1") @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[ def create_candidates_batch() -> Callable[
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]] [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[InMemoryCandidate]]
]: ]:
return get_lowercased_candidates_batch return get_lowercased_candidates_batch

View File

@ -10,9 +10,9 @@ version: 3.5
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
implements all of its methods. It stores all KB data in-memory and generates implements all of its methods. It stores all KB data in-memory and generates
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with [`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
entity names. It's highly optimized for both a low memory footprint and speed of with entity names. It's highly optimized for both a low memory footprint and
retrieval. speed of retrieval.
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"} ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"} ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). Wraps of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
> #### Example > #### Example
@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
> candidates = kb.get_candidates(doc[0:2]) > candidates = kb.get_candidates(doc[0:2])
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | -------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------ |
| `mention` | The textual mention or alias. ~~Span~~ | | `mention` | The textual mention or alias. ~~Span~~ |
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ | | **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
@ -195,15 +195,15 @@ to you.
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]])) > candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | -------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------ |
| `mentions` | The textual mention or alias. ~~SpanGroup~~ | | `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"} ## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). of type [`InMemoryCandidate`](/api/kb#candidate).
> #### Example > #### Example
> >
@ -211,10 +211,10 @@ of type [`Candidate`](/api/kb#candidate).
> candidates = kb.get_alias_candidates("Douglas") > candidates = kb.get_alias_candidates("Douglas")
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ | | `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | | **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~ |
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}

View File

@ -191,25 +191,27 @@ Restore the state of the knowledge base from a given directory. Note that the
| `exclude` | List of components to exclude. ~~Iterable[str]~~ | | `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ | | **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
## Candidate {id="candidate",tag="class"} ## InMemoryCandidate {id="candidate",tag="class"}
A `Candidate` object refers to a textual mention (alias) that may or may not be An `InMemoryCandidate` object refers to a textual mention (alias) that may or may
resolved to a specific entity from a `KnowledgeBase`. This will be used as input not be resolved to a specific entity from a `KnowledgeBase`. This will be used
for the entity linking algorithm which will disambiguate the various candidates as input for the entity linking algorithm which will disambiguate the various
to the correct one. Each candidate `(alias, entity)` pair is assigned to a candidates to the correct one. Each candidate `(alias, entity)` pair is assigned
certain prior probability. to a certain prior probability.
### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"} ### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly, Construct an `InMemoryCandidate` object. Usually this constructor is not called
but instead these objects are returned by the `get_candidates` method of the directly, but instead these objects are returned by the `get_candidates` method
[`entity_linker`](/api/entitylinker) pipe. of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example > #### Example
> >
> ```python > ```python
> from spacy.kb import Candidate > from spacy.kb import InMemoryCandidate
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) > candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ``` > ```
| Name | Description | | Name | Description |
@ -220,7 +222,7 @@ but instead these objects are returned by the `get_candidates` method of the
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ | | `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ | | `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## Candidate attributes {id="candidate-attributes"} ## InMemoryCandidate attributes {id="candidate-attributes"}
| Name | Description | | Name | Description |
| --------------- | ------------------------------------------------------------------------ | | --------------- | ------------------------------------------------------------------------ |