Entity linking: use SpanGroup instead of Iterable[Span] for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Raphael Mitsch 2023-03-20 12:25:18 +01:00 committed by GitHub
parent 9340eb8ad2
commit 3102e2e27a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 23 additions and 22 deletions

View File

@ -2,5 +2,4 @@ from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, InMemoryCandidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

View File

@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool
from .candidate import Candidate
from ..tokens import Span
from ..tokens import Span, SpanGroup
from ..util import SimpleFrozenList
from ..errors import Errors
@ -30,13 +30,13 @@ cdef class KnowledgeBase:
self.entity_vector_length = entity_vector_length
self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
probability of the specified mention text resolving to that entity - might be included.
If no candidates are found for a given mention, an empty list is returned.
mentions (Iterable[Span]): Mentions for which to get candidates.
mentions (SpanGroup): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return [self.get_candidates(span) for span in mentions]

View File

@ -8,7 +8,7 @@ from ...util import registry
from ...kb import KnowledgeBase, InMemoryLookupKB
from ...kb import Candidate
from ...vocab import Vocab
from ...tokens import Span, Doc
from ...tokens import Doc, Span, SpanGroup
from ..extract_spans import extract_spans
from ...errors import Errors
@ -114,7 +114,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
@registry.misc("spacy.CandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
]:
return get_candidates_batch
@ -130,12 +130,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
def get_candidates_batch(
kb: KnowledgeBase, mentions: Iterable[Span]
kb: KnowledgeBase, mentions: SpanGroup
) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for the given mentions and fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
mentions (SpanGroup): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)

View File

@ -11,6 +11,8 @@ from thinc.api import set_dropout_rate
from ..kb import KnowledgeBase, Candidate
from ..tokens import Doc, Span
from ..ml import empty_kb
from ..tokens import Doc, Span, SpanGroup
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
from ..language import Language
@ -82,7 +84,7 @@ def make_entity_linker(
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool,
@ -105,7 +107,7 @@ def make_entity_linker(
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method.
@ -170,7 +172,7 @@ class EntityLinker(TrainablePipe):
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool = False,
@ -194,7 +196,7 @@ class EntityLinker(TrainablePipe):
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
@ -473,7 +475,8 @@ class EntityLinker(TrainablePipe):
batch_candidates = list(
self.get_candidates_batch(
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
self.kb,
SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
)
if self.candidates_batch_size > 1
else [

View File

@ -997,7 +997,6 @@ def test_scorer_links():
)
# fmt: on
def test_legacy_architectures(name, config):
# Ensure that the legacy architectures still work
vector_length = 3
nlp = English()

View File

@ -189,14 +189,15 @@ to you.
>
> ```python
> from spacy.lang.en import English
> from spacy.tokens import SpanGroup
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
> candidates = kb.get_candidates_batch([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------ |
| `mentions` | The textual mentions. ~~Iterable[Span]~~ |
| `mentions` | The textual mentions. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}

View File

@ -93,14 +93,15 @@ to you.
>
> ```python
> from spacy.lang.en import English
> from spacy.tokens import SpanGroup
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
> candidates = kb.get_candidates([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------- |
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
| `mentions` | The textual mentions. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
directly, but instead these objects are returned by the `get_candidates` method
of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example```python
> #### Example
>
> ```python
> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
>
> ```
>
> ```
| Name | Description |