spaCy/spacy/ml/models/entity_linker.py
2022-11-29 22:03:53 +01:00

143 lines
5.1 KiB
Python

from pathlib import Path
from typing import Optional, Callable, Iterable, List, Tuple, Generator, Iterator
from thinc.types import Floats2d
from thinc.api import chain, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear, tuplify, Ragged
from ...util import registry
from ...kb import KnowledgeBase, InMemoryLookupKB
from ...kb import Candidate
from ...vocab import Vocab
from ...tokens import Span, Doc
from ..extract_spans import extract_spans
from ...errors import Errors
@registry.architectures("spacy.EntityLinker.v2")
def build_nel_encoder(
    tok2vec: Model, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    """Assemble the encoder network used by the entity linker.

    The token vectors produced by `tok2vec` are converted to a ragged array
    and paired with the output of the span maker layer. The pair is fed
    through `extract_spans`, pooled with `reduce_mean`, passed through a
    residual Maxout layer and finally through a Linear output layer.

    tok2vec (Model): Embedding layer. Its "nO" dimension, if set, is used as
        the width of the Maxout layer and the input width of the output layer.
    nO (Optional[int]): Output width of the final Linear layer.
    RETURNS (Model[List[Doc], Floats2d]): The composed model, with the refs
        "output_layer" and "tok2vec" set.
    """
    with Model.define_operators({">>": chain, "&": tuplify}):
        width = tok2vec.maybe_get_dim("nO")
        # Built before the rest of the pipeline, exactly as in the one-expression form.
        projection = Linear(nO=nO, nI=width)
        # Each step below mirrors one operator of the left-associative chain.
        ragged_vectors = tok2vec >> list2ragged()
        vectors_and_spans = ragged_vectors & build_span_maker()
        span_vectors = vectors_and_spans >> extract_spans()
        pooled = span_vectors >> reduce_mean()
        hidden = pooled >> residual(
            Maxout(nO=width, nI=width, nP=2, dropout=0.0)  # type: ignore
        )
        model = hidden >> projection
        model.set_ref("output_layer", projection)
        model.set_ref("tok2vec", tok2vec)
    # Marks this architecture as the non-legacy variant.
    model.attrs["include_span_maker"] = True
    return model
def build_span_maker(n_sents: int = 0) -> Model:
    """Create the "span_maker" layer, whose forward pass is `span_maker_forward`.

    n_sents (int): Number of sentences to include on either side of an
        entity's own sentence. Stored in the model's "n_sents" attribute.
    RETURNS (Model): The span maker layer.
    """
    span_maker: Model = Model(
        "span_maker",
        forward=span_maker_forward,
        attrs={"n_sents": n_sents},
    )
    return span_maker
def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
    """Compute, for every entity in every doc, the token span of its context.

    The context of an entity is the sentence containing it plus up to
    `n_sents` sentences before and after, clipped to the document. If a doc
    has no sentence boundaries (`doc.sents` raises ValueError), the first
    token is marked as the only sentence start and the whole doc is treated
    as a single sentence.

    model (Model): The span maker layer; provides `ops` and the "n_sents" attr.
    docs (List[Doc]): Documents whose entities should be turned into spans.
    is_train: Unused.
    RETURNS (Tuple[Ragged, Callable]): A Ragged holding the flattened
        (start_token, end_token) pairs with one length per doc, and a
        backprop callback that returns an empty list.
    RAISES (RuntimeError): With Errors.E030 if looking up an entity's
        sentence raises AttributeError.
    """
    ops = model.ops
    window = model.attrs["n_sents"]
    spans_per_doc = []
    for doc in docs:
        try:
            sents = list(doc.sents)
        except ValueError:
            # No sentence info, which is normal during initialization:
            # mark only the first token as a sentence start.
            for token in doc:
                token.is_sent_start = token.i == 0
            sents = [doc[:]]
        last_index = len(sents) - 1
        doc_spans = []
        for ent in doc.ents:
            try:
                # Locate the entity's sentence among the doc's sentences.
                position = sents.index(ent.sent)
            except AttributeError:
                # Raised when ent.sent is None; surface a user-friendly error.
                raise RuntimeError(Errors.E030) from None
            # Widen by `window` sentences on each side, clipped to the doc.
            first = max(0, position - window)
            last = min(last_index, position + window)
            # Token offsets delimiting the context, saved for extraction.
            doc_spans.append((sents[first].start, sents[last].end))
        spans_per_doc.append(ops.asarray2i(doc_spans))
    lengths = ops.asarray1i([len(spans) for spans in spans_per_doc])
    ragged_spans = Ragged(ops.flatten(spans_per_doc), lengths)

    def backprop(d_spans):
        # This layer only rearranges docs, so there is no gradient to pass on.
        return []

    return ragged_spans, backprop
@registry.misc("spacy.KBFromFile.v1")
def load_kb(
    kb_path: Path,
) -> Callable[[Vocab], KnowledgeBase]:
    """Create a loader that reads a knowledge base from disk.

    kb_path (Path): Location passed to the knowledge base's `from_disk`.
    RETURNS (Callable[[Vocab], KnowledgeBase]): Function that, given a vocab,
        builds an InMemoryLookupKB and loads its contents from `kb_path`.
    """

    def kb_from_file(vocab: Vocab):
        # NOTE(review): entity_vector_length=1 looks like a placeholder that
        # from_disk is expected to replace - confirm against the KB implementation.
        knowledge_base = InMemoryLookupKB(vocab, entity_vector_length=1)
        knowledge_base.from_disk(kb_path)
        return knowledge_base

    return kb_from_file
@registry.misc("spacy.EmptyKB.v1")
def empty_kb(
    entity_vector_length: int,
) -> Callable[[Vocab], KnowledgeBase]:
    """Create a factory for InMemoryLookupKB instances with a fixed vector length.

    entity_vector_length (int): Value passed as the KB's entity_vector_length.
    RETURNS (Callable[[Vocab], KnowledgeBase]): Function that, given a vocab,
        constructs a new InMemoryLookupKB.
    """

    def empty_kb_factory(vocab: Vocab):
        knowledge_base = InMemoryLookupKB(
            vocab=vocab,
            entity_vector_length=entity_vector_length,
        )
        return knowledge_base

    return empty_kb_factory
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
    """Create a factory for InMemoryLookupKB instances.

    Unlike "spacy.EmptyKB.v1", the vector length is supplied when the
    returned function is called rather than when it is created.

    RETURNS (Callable[[Vocab, int], KnowledgeBase]): Function that, given a
        vocab and an entity vector length, constructs a new InMemoryLookupKB.
    """

    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
        knowledge_base = InMemoryLookupKB(
            vocab=vocab,
            entity_vector_length=entity_vector_length,
        )
        return knowledge_base

    return empty_kb_factory
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Look up the candidate entities for a single mention by delegating to the
    knowledge base's own `get_candidates` method.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    candidates = kb.get_candidates(mention)
    return candidates
def get_candidates_all(
    kb: KnowledgeBase, mentions: Iterator[Iterable[Span]]
) -> Iterator[Iterable[Iterable[Candidate]]]:
    """
    Look up the candidate entities for batches of mentions by delegating to
    the knowledge base's own `get_candidates_all` method.
    kb (KnowledgeBase): Knowledge base to query.
    mentions (Iterator[Iterable[Span]]): Entity mentions per document for which to identify candidates.
    RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per document.
    """
    candidates_per_doc = kb.get_candidates_all(mentions)
    return candidates_per_doc
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
    """Provide the per-mention candidate generator.

    RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): The
        module-level `get_candidates` function.
    """
    candidate_generator = get_candidates
    return candidate_generator
@registry.misc("spacy.CandidateAllGenerator.v1")
def create_candidates_all() -> Callable[
    [KnowledgeBase, Generator[Iterable[Span], None, None]],
    Iterator[Iterable[Iterable[Candidate]]],
]:
    """Provide the batched candidate generator.

    RETURNS (Callable): The module-level `get_candidates_all` function, which
        takes a knowledge base and the mentions per document and returns the
        candidates per document.
    """
    batch_candidate_generator = get_candidates_all
    return batch_candidate_generator