mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-25 11:23:40 +03:00
Entity linking: use SpanGroup
instead of Iterable[Span]
for mentions (#12344)
* Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span]. * Update docs. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refactor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Reverse erroneous changes during merge. * Update return type in EL tests. * Re-add Candidate to setup.py. * Format updated docs. --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
9340eb8ad2
commit
3102e2e27a
|
@ -2,5 +2,4 @@ from .kb import KnowledgeBase
|
||||||
from .kb_in_memory import InMemoryLookupKB
|
from .kb_in_memory import InMemoryLookupKB
|
||||||
from .candidate import Candidate, InMemoryCandidate
|
from .candidate import Candidate, InMemoryCandidate
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .candidate import Candidate
|
from .candidate import Candidate
|
||||||
from ..tokens import Span
|
from ..tokens import Span, SpanGroup
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
@ -30,13 +30,13 @@ cdef class KnowledgeBase:
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
|
||||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
||||||
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||||
probability of the specified mention text resolving to that entity - might be included.
|
probability of the specified mention text resolving to that entity - might be included.
|
||||||
If no candidates are found for a given mention, an empty list is returned.
|
If no candidates are found for a given mention, an empty list is returned.
|
||||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
mentions (SpanGroup): Mentions for which to get candidates.
|
||||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
"""
|
"""
|
||||||
return [self.get_candidates(span) for span in mentions]
|
return [self.get_candidates(span) for span in mentions]
|
||||||
|
|
|
@ -8,7 +8,7 @@ from ...util import registry
|
||||||
from ...kb import KnowledgeBase, InMemoryLookupKB
|
from ...kb import KnowledgeBase, InMemoryLookupKB
|
||||||
from ...kb import Candidate
|
from ...kb import Candidate
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
from ...tokens import Span, Doc
|
from ...tokens import Doc, Span, SpanGroup
|
||||||
from ..extract_spans import extract_spans
|
from ..extract_spans import extract_spans
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
@ -114,7 +114,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
||||||
|
|
||||||
@registry.misc("spacy.CandidateBatchGenerator.v1")
|
@registry.misc("spacy.CandidateBatchGenerator.v1")
|
||||||
def create_candidates_batch() -> Callable[
|
def create_candidates_batch() -> Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||||
]:
|
]:
|
||||||
return get_candidates_batch
|
return get_candidates_batch
|
||||||
|
|
||||||
|
@ -130,12 +130,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||||
|
|
||||||
|
|
||||||
def get_candidates_batch(
|
def get_candidates_batch(
|
||||||
kb: KnowledgeBase, mentions: Iterable[Span]
|
kb: KnowledgeBase, mentions: SpanGroup
|
||||||
) -> Iterable[Iterable[Candidate]]:
|
) -> Iterable[Iterable[Candidate]]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||||
kb (KnowledgeBase): Knowledge base to query.
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
|
mentions (SpanGroup): Entity mentions for which to identify candidates.
|
||||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
"""
|
"""
|
||||||
return kb.get_candidates_batch(mentions)
|
return kb.get_candidates_batch(mentions)
|
||||||
|
|
|
@ -11,6 +11,8 @@ from thinc.api import set_dropout_rate
|
||||||
|
|
||||||
from ..kb import KnowledgeBase, Candidate
|
from ..kb import KnowledgeBase, Candidate
|
||||||
from ..tokens import Doc, Span
|
from ..tokens import Doc, Span
|
||||||
|
from ..ml import empty_kb
|
||||||
|
from ..tokens import Doc, Span, SpanGroup
|
||||||
from .pipe import deserialize_config
|
from .pipe import deserialize_config
|
||||||
from .trainable_pipe import TrainablePipe
|
from .trainable_pipe import TrainablePipe
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
|
@ -82,7 +84,7 @@ def make_entity_linker(
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
|
@ -105,7 +107,7 @@ def make_entity_linker(
|
||||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
get_candidates_batch (
|
get_candidates_batch (
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||||
scorer (Optional[Callable]): The scoring method.
|
scorer (Optional[Callable]): The scoring method.
|
||||||
|
@ -170,7 +172,7 @@ class EntityLinker(TrainablePipe):
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
|
@ -194,7 +196,7 @@ class EntityLinker(TrainablePipe):
|
||||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||||
get_candidates_batch (
|
get_candidates_batch (
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
|
||||||
Iterable[Candidate]]
|
Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||||
|
@ -473,7 +475,8 @@ class EntityLinker(TrainablePipe):
|
||||||
|
|
||||||
batch_candidates = list(
|
batch_candidates = list(
|
||||||
self.get_candidates_batch(
|
self.get_candidates_batch(
|
||||||
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
|
self.kb,
|
||||||
|
SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
|
||||||
)
|
)
|
||||||
if self.candidates_batch_size > 1
|
if self.candidates_batch_size > 1
|
||||||
else [
|
else [
|
||||||
|
|
|
@ -997,7 +997,6 @@ def test_scorer_links():
|
||||||
)
|
)
|
||||||
# fmt: on
|
# fmt: on
|
||||||
def test_legacy_architectures(name, config):
|
def test_legacy_architectures(name, config):
|
||||||
|
|
||||||
# Ensure that the legacy architectures still work
|
# Ensure that the legacy architectures still work
|
||||||
vector_length = 3
|
vector_length = 3
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
|
|
@ -189,14 +189,15 @@ to you.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.lang.en import English
|
> from spacy.lang.en import English
|
||||||
|
> from spacy.tokens import SpanGroup
|
||||||
> nlp = English()
|
> nlp = English()
|
||||||
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
|
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------------------------------------------------ |
|
| ----------- | ------------------------------------------------------------------------------------------------------------ |
|
||||||
| `mentions` | The textual mentions. ~~Iterable[Span]~~ |
|
| `mentions` | The textual mentions. ~~SpanGroup~~ |
|
||||||
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
|
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
|
||||||
|
|
||||||
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
||||||
|
|
|
@ -93,14 +93,15 @@ to you.
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> from spacy.lang.en import English
|
> from spacy.lang.en import English
|
||||||
|
> from spacy.tokens import SpanGroup
|
||||||
> nlp = English()
|
> nlp = English()
|
||||||
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||||
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
|
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
| `mentions` | The textual mentions. ~~SpanGroup~~ |
|
||||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||||
|
|
||||||
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
||||||
|
@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
|
||||||
directly, but instead these objects are returned by the `get_candidates` method
|
directly, but instead these objects are returned by the `get_candidates` method
|
||||||
of the [`entity_linker`](/api/entitylinker) pipe.
|
of the [`entity_linker`](/api/entitylinker) pipe.
|
||||||
|
|
||||||
> #### Example```python
|
> #### Example
|
||||||
>
|
>
|
||||||
|
> ```python
|
||||||
> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
|
> from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb,
|
||||||
> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||||
>
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user