mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Entity linking: use SpanGroup
instead of Iterable[Span]
for mentions (#12344)
* Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Change type of mentions to look up entity candidates for from Iterable[Span] to SpanGroup. * Update docs. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refactor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Reverse erroneous changes during merge. * Update return type in EL tests. * Re-add Candidate to setup.py. * Format updated docs. --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
9340eb8ad2
commit
3102e2e27a
|
@ -2,5 +2,4 @@ from .kb import KnowledgeBase
|
|||
from .kb_in_memory import InMemoryLookupKB
|
||||
from .candidate import Candidate, InMemoryCandidate
|
||||
|
||||
|
||||
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .candidate import Candidate
|
||||
from ..tokens import Span
|
||||
from ..tokens import Span, SpanGroup
|
||||
from ..util import SimpleFrozenList
|
||||
from ..errors import Errors
|
||||
|
||||
|
@ -30,13 +30,13 @@ cdef class KnowledgeBase:
|
|||
self.entity_vector_length = entity_vector_length
|
||||
self.mem = Pool()
|
||||
|
||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
||||
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||
probability of the specified mention text resolving to that entity - might be included.
|
||||
If no candidates are found for a given mention, an empty list is returned.
|
||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||
mentions (SpanGroup): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return [self.get_candidates(span) for span in mentions]
|
||||
|
|
|
@ -8,7 +8,7 @@ from ...util import registry
|
|||
from ...kb import KnowledgeBase, InMemoryLookupKB
|
||||
from ...kb import Candidate
|
||||
from ...vocab import Vocab
|
||||
from ...tokens import Span, Doc
|
||||
from ...tokens import Doc, Span, SpanGroup
|
||||
from ..extract_spans import extract_spans
|
||||
from ...errors import Errors
|
||||
|
||||
|
@ -114,7 +114,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
|||
|
||||
@registry.misc("spacy.CandidateBatchGenerator.v1")
|
||||
def create_candidates_batch() -> Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||
]:
|
||||
return get_candidates_batch
|
||||
|
||||
|
@ -130,12 +130,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
|||
|
||||
|
||||
def get_candidates_batch(
|
||||
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||
kb: KnowledgeBase, mentions: SpanGroup
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
mentions (SpanGroup): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
||||
|
|
|
@ -11,6 +11,8 @@ from thinc.api import set_dropout_rate
|
|||
|
||||
from ..kb import KnowledgeBase, Candidate
|
||||
from ..tokens import Doc, Span
|
||||
from ..ml import empty_kb
|
||||
from ..tokens import Doc, Span, SpanGroup
|
||||
from .pipe import deserialize_config
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ..language import Language
|
||||
|
@ -82,7 +84,7 @@ def make_entity_linker(
|
|||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
|
@ -105,7 +107,7 @@ def make_entity_linker(
|
|||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
|
@ -170,7 +172,7 @@ class EntityLinker(TrainablePipe):
|
|||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = False,
|
||||
|
@ -194,7 +196,7 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
|
@ -473,7 +475,8 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
batch_candidates = list(
|
||||
self.get_candidates_batch(
|
||||
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
|
||||
self.kb,
|
||||
SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
|
||||
)
|
||||
if self.candidates_batch_size > 1
|
||||
else [
|
||||
|
|
|
@ -997,7 +997,6 @@ def test_scorer_links():
|
|||
)
|
||||
# fmt: on
|
||||
def test_legacy_architectures(name, config):
|
||||
|
||||
# Ensure that the legacy architectures still work
|
||||
vector_length = 3
|
||||
nlp = English()
|
||||
|
|
|
@ -189,14 +189,15 @@ to you.
|
|||
>
|
||||
> ```python
|
||||
> from spacy.lang.en import English
|
||||
> from spacy.tokens import SpanGroup
|
||||
> nlp = English()
|
||||
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
|
||||
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------ |
|
||||
| `mentions` | The textual mentions. ~~Iterable[Span]~~ |
|
||||
| `mentions` | The textual mentions. ~~SpanGroup~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
||||
|
|
|
@ -93,14 +93,15 @@ to you.
|
|||
>
|
||||
> ```python
|
||||
> from spacy.lang.en import English
|
||||
> from spacy.tokens import SpanGroup
|
||||
> nlp = English()
|
||||
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
|
||||
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
||||
| `mentions` | The textual mentions. ~~SpanGroup~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||
|
||||
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
||||
|
@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
|
|||
directly, but instead these objects are returned by the `get_candidates` method
|
||||
of the [`entity_linker`](/api/entitylinker) pipe.
|
||||
|
||||
> #### Example```python
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb,
|
||||
> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||
>
|
||||
> ```
|
||||
>
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
|
Loading…
Reference in New Issue
Block a user