spaCy/spacy/ml/models/entity_linker.py
Raphael Mitsch 304b9331e6
Modify EL batching to doc-wise streaming approach (#12367)
* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Modify EL batching system.

* Update leftover get_candidates() mention in docs.

* Format docs.

* Format.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Updated error code.

* Simplify interface for int/str representations.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename 'alias' to 'mention'.

* Port Candidate and InMemoryCandidate to Cython.

* Remove redundant entry in setup.py.

* Add abstract class check.

* Drop storing mention.

* Update spacy/kb/candidate.pxd

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix entity_id refactoring problems in docstrings.

* Drop unused InMemoryCandidate._entity_hash.

* Update docstrings.

* Move attributes out of Candidate.

* Partially fix alias/mention terminology usage. Convert Candidate to interface.

* Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs().

* Update docstrings related to prior_prob.

* Update alias/mention usage in doc(strings).

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs.

* Update docstrings.

* Fix InMemoryCandidate attribute names.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/ml/models/entity_linker.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update W401 test.

* Update spacy/errors.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Use Candidate output type for toy generators in the test suite to mimick best practices

* fix docs

* fix import

* Fix merge leftovers.

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/entitylinker.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/kb/kb_in_memory.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/inmemorylookupkb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update get_candidates() docstring.

* Reformat imports in entity_linker.py.

* Drop valid_ent_idx_per_doc.

* Update docs.

* Format.

* Simplify doc loop in predict().

* Remove E1044 comment.

* Fix merge errors.

* Format.

* Format.

* Format.

* Fix merge error & tests.

* Format.

* Apply suggestions from code review

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

* Use type alias.

* isort.

* isort.

* Lint.

* Add typedefs.pyx.

* Fix typedef import.

* Fix type aliases.

* Format.

* Update docstring and type usage.

* Add info on get_candidates(), get_candidates_batched().

* Readd get_candidates info to v3 changelog.

* Update website/docs/api/entitylinker.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update factory functions for backwards compatibility.

* Format.

* Ignore mypy error.

* Fix mypy error.

* Format.

* Add test for multiple docs with multiple entities.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
Co-authored-by: svlandeg <svlandeg@github.com>
2024-04-09 11:39:18 +02:00

158 lines
5.2 KiB
Python

from pathlib import Path
from typing import Callable, Iterable, Iterator, List, Optional, Tuple
from thinc.api import (
Linear,
Maxout,
Model,
Ragged,
chain,
list2ragged,
reduce_mean,
residual,
tuplify,
)
from thinc.types import Floats2d
from ...errors import Errors
from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
from ...tokens import Doc, Span, SpanGroup
from ...util import registry
from ...vocab import Vocab
from ..extract_spans import extract_spans
CandidatesForMentionT = Iterable[Candidate]
CandidatesForDocT = Iterable[CandidatesForMentionT]
@registry.architectures("spacy.EntityLinker.v2")
def build_nel_encoder(
tok2vec: Model, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
with Model.define_operators({">>": chain, "&": tuplify}):
token_width = tok2vec.maybe_get_dim("nO")
output_layer = Linear(nO=nO, nI=token_width)
model = (
((tok2vec >> list2ragged()) & build_span_maker())
>> extract_spans()
>> reduce_mean()
>> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore
>> output_layer
)
model.set_ref("output_layer", output_layer)
model.set_ref("tok2vec", tok2vec)
# flag to show this isn't legacy
model.attrs["include_span_maker"] = True
return model
def build_span_maker(n_sents: int = 0) -> Model:
model: Model = Model("span_maker", forward=span_maker_forward)
model.attrs["n_sents"] = n_sents
return model
def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
ops = model.ops
n_sents = model.attrs["n_sents"]
candidates = []
for doc in docs:
cands = []
try:
sentences = [s for s in doc.sents]
except ValueError:
# no sentence info, normal in initialization
for tok in doc:
tok.is_sent_start = tok.i == 0
sentences = [doc[:]]
for ent in doc.ents:
try:
# find the sentence in the list of sentences.
sent_index = sentences.index(ent.sent)
except AttributeError:
# Catch the exception when ent.sent is None and provide a user-friendly warning
raise RuntimeError(Errors.E030) from None
# get n previous sentences, if there are any
start_sentence = max(0, sent_index - n_sents)
# get n posterior sentences, or as many < n as there are
end_sentence = min(len(sentences) - 1, sent_index + n_sents)
# get token positions
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
# save positions for extraction
cands.append((start_token, end_token))
candidates.append(ops.asarray2i(cands))
lengths = model.ops.asarray1i([len(cands) for cands in candidates])
out = Ragged(model.ops.flatten(candidates), lengths)
# because this is just rearranging docs, the backprop does nothing
return out, lambda x: []
@registry.misc("spacy.KBFromFile.v1")
def load_kb(
kb_path: Path,
) -> Callable[[Vocab], KnowledgeBase]:
def kb_from_file(vocab: Vocab):
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.from_disk(kb_path)
return kb
return kb_from_file
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@registry.misc("spacy.EmptyKB.v1")
def empty_kb(
entity_vector_length: int,
) -> Callable[[Vocab], KnowledgeBase]:
def empty_kb_factory(vocab: Vocab):
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@registry.misc("spacy.CandidateGenerator.v1")
def create_get_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates
@registry.misc("spacy.CandidateGenerator.v2")
def create_get_candidates_v2() -> Callable[
[KnowledgeBase, Iterator[SpanGroup]], Iterator[CandidatesForDocT]
]:
return get_candidates_v2
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for the given mention from the KB.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention.
RETURNS (Iterable[Candidate]): Identified candidates for specified mention.
"""
cands_per_doc = next(
get_candidates_v2(kb, iter([SpanGroup(mention.doc, spans=[mention])]))
)
assert isinstance(cands_per_doc, list)
return next(cands_per_doc[0])
def get_candidates_v2(
kb: KnowledgeBase, mentions: Iterator[SpanGroup]
) -> Iterator[Iterable[Iterable[Candidate]]]:
"""
Return candidate entities for the given mentions from the KB.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterator[SpanGroup]): Mentions per doc.
RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mentions in document/SpanGroup.
"""
return kb.get_candidates(mentions)