mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-22 18:12:00 +03:00
Merge branch 'refactor/el-candidates' into refactor/span-group-for-mentions
# Conflicts: # spacy/ml/models/entity_linker.py # website/docs/api/inmemorylookupkb.mdx
This commit is contained in:
commit
3beda2b23a
|
@ -1,5 +1,5 @@
|
|||
from .kb import KnowledgeBase
|
||||
from .kb_in_memory import InMemoryLookupKB
|
||||
from .candidate import Candidate
|
||||
from .candidate import Candidate, InMemoryCandidate
|
||||
|
||||
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate"]
|
||||
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
||||
|
|
|
@ -2,8 +2,8 @@ import abc
|
|||
from typing import List, Union, Callable
|
||||
|
||||
|
||||
class BaseCandidate(abc.ABC):
|
||||
"""A `BaseCandidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
class Candidate(abc.ABC):
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity_id) pair is assigned a certain prior probability.
|
||||
|
@ -12,26 +12,38 @@ class BaseCandidate(abc.ABC):
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self, mention: str, entity_id: Union[int, str], entity_vector: List[float]
|
||||
self,
|
||||
mention: str,
|
||||
entity_id: int,
|
||||
entity_name: str,
|
||||
entity_vector: List[float],
|
||||
prior_prob: float,
|
||||
):
|
||||
"""Create new instance of `Candidate`. Note: has to be a sub-class, otherwise error will be raised.
|
||||
"""Initializes properties of `Candidate` instance.
|
||||
mention (str): Mention text for this candidate.
|
||||
entity_id (Union[int, str]): Unique entity ID.
|
||||
entity_id (int): Unique entity ID.
|
||||
entity_name (str): Entity name.
|
||||
entity_vector (List[float]): Entity embedding.
|
||||
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
|
||||
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
|
||||
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
|
||||
doesn't) it might be better to eschew this information and always supply the same value.
|
||||
"""
|
||||
self._mention = mention
|
||||
self._entity_id = entity_id
|
||||
self._entity_name = entity_name
|
||||
self._entity_vector = entity_vector
|
||||
self._prior_prob = prior_prob
|
||||
|
||||
@property
|
||||
def entity(self) -> Union[int, str]:
|
||||
"""RETURNS (Union[int, str]): Entity ID."""
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (int): Unique entity ID."""
|
||||
return self._entity_id
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (str): Entity name."""
|
||||
"""RETURNS (str): Entity name."""
|
||||
return self._entity_name
|
||||
|
||||
@property
|
||||
def mention(self) -> str:
|
||||
|
@ -43,9 +55,14 @@ class BaseCandidate(abc.ABC):
|
|||
"""RETURNS (List[float]): Entity vector."""
|
||||
return self._entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
"""RETURNS (float): Prior probability of the entity for this mention."""
|
||||
return self._prior_prob
|
||||
|
||||
class Candidate(BaseCandidate):
|
||||
"""`Candidate` for InMemoryLookupKBCandidate."""
|
||||
|
||||
class InMemoryCandidate(Candidate):
|
||||
"""Candidate for InMemoryLookupKB."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -57,7 +74,7 @@ class Candidate(BaseCandidate):
|
|||
prior_prob: float,
|
||||
):
|
||||
"""
|
||||
retrieve_string_from_hash (Callable[[int], str]): Callable retrieveing entity name from provided entity/vocab
|
||||
retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
|
||||
hash.
|
||||
entity_hash (int): Hashed entity name/ID.
|
||||
entity_freq (int): Entity frequency in KB corpus.
|
||||
|
@ -71,7 +88,9 @@ class Candidate(BaseCandidate):
|
|||
super().__init__(
|
||||
mention=retrieve_string_from_hash(alias_hash),
|
||||
entity_id=entity_hash,
|
||||
entity_name=retrieve_string_from_hash(entity_hash),
|
||||
entity_vector=entity_vector,
|
||||
prior_prob=prior_prob,
|
||||
)
|
||||
self._retrieve_string_from_hash = retrieve_string_from_hash
|
||||
self._entity_hash = entity_hash
|
||||
|
@ -84,11 +103,6 @@ class Candidate(BaseCandidate):
|
|||
"""RETURNS (int): hash of the entity's KB ID/name"""
|
||||
return self._entity_hash
|
||||
|
||||
@property
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (str): ID/name of this entity_id in the KB"""
|
||||
return self._retrieve_string_from_hash(self._entity_hash)
|
||||
|
||||
@property
|
||||
def alias(self) -> int:
|
||||
"""RETURNS (int): hash of the alias"""
|
||||
|
@ -102,8 +116,3 @@ class Candidate(BaseCandidate):
|
|||
@property
|
||||
def entity_freq(self) -> float:
|
||||
return self._entity_freq
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
"""RETURNS (float): Prior probability of the entity for this mention."""
|
||||
return self._prior_prob
|
||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
|||
from ..util import SimpleFrozenList, ensure_path
|
||||
from ..vocab cimport Vocab
|
||||
from .kb cimport KnowledgeBase
|
||||
from .candidate import Candidate as Candidate
|
||||
from .candidate import InMemoryCandidate
|
||||
|
||||
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
|
@ -223,10 +223,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
alias_entry.probs = probs
|
||||
self._aliases_table[alias_index] = alias_entry
|
||||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
|
||||
return self.get_alias_candidates(mention.text) # type: ignore
|
||||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
|
@ -239,7 +239,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
alias_entry = self._aliases_table[alias_index]
|
||||
|
||||
return [
|
||||
Candidate(
|
||||
InMemoryCandidate(
|
||||
retrieve_string_from_hash=self.vocab.strings.__getitem__,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
|
|
|
@ -116,7 +116,7 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
|||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
RETURNS (Iterable[InMemoryCandidate]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates(mention)
|
||||
|
||||
|
@ -128,6 +128,6 @@ def get_candidates_batch(
|
|||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mentions (SpanGroup): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
RETURNS (Iterable[Iterable[InMemoryCandidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
||||
|
|
|
@ -7,7 +7,7 @@ from thinc.types import Ragged
|
|||
from spacy import registry, util
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
from spacy.compat import pickle
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
||||
from spacy.kb import InMemoryCandidate, InMemoryLookupKB, KnowledgeBase
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml import load_kb
|
||||
from spacy.ml.models.entity_linker import build_span_maker, get_candidates
|
||||
|
@ -506,13 +506,13 @@ def test_el_pipe_configuration(nlp):
|
|||
|
||||
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
||||
def create_candidates() -> Callable[
|
||||
[InMemoryLookupKB, "Span"], Iterable[Candidate]
|
||||
[InMemoryLookupKB, "Span"], Iterable[InMemoryCandidate]
|
||||
]:
|
||||
return get_lowercased_candidates
|
||||
|
||||
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
|
||||
def create_candidates_batch() -> Callable[
|
||||
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
|
||||
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[InMemoryCandidate]]
|
||||
]:
|
||||
return get_lowercased_candidates_batch
|
||||
|
||||
|
|
|
@ -10,9 +10,9 @@ version: 3.5
|
|||
|
||||
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
|
||||
implements all of its methods. It stores all KB data in-memory and generates
|
||||
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
|
||||
entity names. It's highly optimized for both a low memory footprint and speed of
|
||||
retrieval.
|
||||
[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
|
||||
with entity names. It's highly optimized for both a low memory footprint and
|
||||
speed of retrieval.
|
||||
|
||||
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
|
|||
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb#candidate). Wraps
|
||||
of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
|
||||
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
||||
|
||||
> #### Example
|
||||
|
@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
|
|||
> candidates = kb.get_candidates(doc[0:2])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------ |
|
||||
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||
| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
|
||||
|
||||
|
@ -195,15 +195,15 @@ to you.
|
|||
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||
| `mentions` | The textual mention or alias. ~~SpanGroup~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------ |
|
||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb#candidate).
|
||||
of type [`InMemoryCandidate`](/api/kb#candidate).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -211,10 +211,10 @@ of type [`Candidate`](/api/kb#candidate).
|
|||
> candidates = kb.get_alias_candidates("Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------- |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| **RETURNS** | The list of relevant `InMemoryCandidate` objects. ~~List[InMemoryCandidate]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
||||
|
||||
|
|
|
@ -191,25 +191,27 @@ Restore the state of the knowledge base from a given directory. Note that the
|
|||
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
||||
|
||||
## Candidate {id="candidate",tag="class"}
|
||||
## InMemoryCandidate {id="candidate",tag="class"}
|
||||
|
||||
A `Candidate` object refers to a textual mention (alias) that may or may not be
|
||||
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
|
||||
for the entity linking algorithm which will disambiguate the various candidates
|
||||
to the correct one. Each candidate `(alias, entity)` pair is assigned to a
|
||||
certain prior probability.
|
||||
An `InMemoryCandidate` object refers to a textual mention (alias) that may or may
|
||||
not be resolved to a specific entity from a `KnowledgeBase`. This will be used
|
||||
as input for the entity linking algorithm which will disambiguate the various
|
||||
candidates to the correct one. Each candidate `(alias, entity)` pair is assigned
|
||||
a certain prior probability.
|
||||
|
||||
### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
||||
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
||||
|
||||
Construct a `Candidate` object. Usually this constructor is not called directly,
|
||||
but instead these objects are returned by the `get_candidates` method of the
|
||||
[`entity_linker`](/api/entitylinker) pipe.
|
||||
Construct an `InMemoryCandidate` object. Usually this constructor is not called
|
||||
directly, but instead these objects are returned by the `get_candidates` method
|
||||
of the [`entity_linker`](/api/entitylinker) pipe.
|
||||
|
||||
> #### Example
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.kb import InMemoryCandidate
|
||||
> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||
> ```
|
||||
>
|
||||
> ```python
|
||||
> from spacy.kb import Candidate
|
||||
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -220,7 +222,7 @@ but instead these objects are returned by the `get_candidates` method of the
|
|||
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
|
||||
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||
|
||||
## Candidate attributes {id="candidate-attributes"}
|
||||
## InMemoryCandidate attributes {id="candidate-attributes"}
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ------------------------------------------------------------------------ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user