mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
304b9331e6
* Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span]. * Update docs. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refacor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Modify EL batching system. * Update leftover get_candidates() mention in docs. * Format docs. * Format. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Updated error code. * Simplify interface for int/str representations. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Rename 'alias' to 'mention'. * Port Candidate and InMemoryCandidate to Cython. * Remove redundant entry in setup.py. * Add abstract class check. * Drop storing mention. * Update spacy/kb/candidate.pxd Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix entity_id refactoring problems in docstrings. * Drop unused InMemoryCandidate._entity_hash. * Update docstrings. * Move attributes out of Candidate. * Partially fix alias/mention terminology usage. Convert Candidate to interface. * Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs(). 
* Update docstrings related to prior_prob. * Update alias/mention usage in doc(strings). * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs. * Update docstrings. * Fix InMemoryCandidate attribute names. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update W401 test. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Use Candidate output type for toy generators in the test suite to mimick best practices * fix docs * fix import * Fix merge leftovers. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/entitylinker.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/kb/kb_in_memory.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/inmemorylookupkb.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update get_candidates() docstring. * Reformat imports in entity_linker.py. * Drop valid_ent_idx_per_doc. * Update docs. * Format. * Simplify doc loop in predict(). * Remove E1044 comment. * Fix merge errors. * Format. * Format. * Format. * Fix merge error & tests. * Format. 
* Apply suggestions from code review Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> * Use type alias. * isort. * isort. * Lint. * Add typedefs.pyx. * Fix typedef import. * Fix type aliases. * Format. * Update docstring and type usage. * Add info on get_candidates(), get_candidates_batched(). * Readd get_candidates info to v3 changelog. * Update website/docs/api/entitylinker.mdx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update factory functions for backwards compatibility. * Format. * Ignore mypy error. * Fix mypy error. * Format. * Add test for multiple docs with multiple entities. --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> Co-authored-by: svlandeg <svlandeg@github.com>
158 lines
5.2 KiB
Python
158 lines
5.2 KiB
Python
from pathlib import Path
|
|
from typing import Callable, Iterable, Iterator, List, Optional, Tuple
|
|
|
|
from thinc.api import (
|
|
Linear,
|
|
Maxout,
|
|
Model,
|
|
Ragged,
|
|
chain,
|
|
list2ragged,
|
|
reduce_mean,
|
|
residual,
|
|
tuplify,
|
|
)
|
|
from thinc.types import Floats2d
|
|
|
|
from ...errors import Errors
|
|
from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
|
from ...tokens import Doc, Span, SpanGroup
|
|
from ...util import registry
|
|
from ...vocab import Vocab
|
|
from ..extract_spans import extract_spans
|
|
|
|
# Type alias: the candidates returned for a single mention.
CandidatesForMentionT = Iterable[Candidate]
|
|
# Type alias: one CandidatesForMentionT per mention, grouped for one doc.
CandidatesForDocT = Iterable[CandidatesForMentionT]
|
|
|
|
|
|
@registry.architectures("spacy.EntityLinker.v2")
def build_nel_encoder(
    tok2vec: Model, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    """
    Assemble the entity linker encoder as a chain of layers:
    (tok2vec >> list2ragged) & span_maker -> extract_spans -> reduce_mean
    -> residual(Maxout) -> Linear.
    tok2vec (Model): Token-to-vector layer. Its "nO" dimension, as reported by
        maybe_get_dim(), is used as the Maxout width and as the input width of
        the output layer.
    nO (Optional[int]): Output width of the final Linear layer.
    RETURNS (Model[List[Doc], Floats2d]): The composed model, with refs
        "output_layer" and "tok2vec" set and attrs["include_span_maker"] = True.
    """
    with Model.define_operators({">>": chain, "&": tuplify}):
        width = tok2vec.maybe_get_dim("nO")
        output_layer = Linear(nO=nO, nI=width)
        # Each stage is bound to a name; operands are still evaluated and
        # combined strictly left to right, so the layer nesting is unchanged.
        vectors_and_spans = (tok2vec >> list2ragged()) & build_span_maker()
        extracted = vectors_and_spans >> extract_spans()
        pooled = extracted >> reduce_mean()
        hidden = pooled >> residual(Maxout(nO=width, nI=width, nP=2, dropout=0.0))  # type: ignore
        model = hidden >> output_layer
        model.set_ref("output_layer", output_layer)
        model.set_ref("tok2vec", tok2vec)
    # flag to show this isn't legacy
    model.attrs["include_span_maker"] = True
    return model
|
|
|
|
|
|
def build_span_maker(n_sents: int = 0) -> Model:
    """
    Create the "span_maker" layer, whose forward pass is span_maker_forward.
    n_sents (int): Number of neighbouring sentences to include on each side of
        an entity's sentence; stored in the layer's attrs under "n_sents".
    RETURNS (Model): The span maker layer.
    """
    span_maker: Model = Model(
        "span_maker",
        forward=span_maker_forward,
        attrs={"n_sents": n_sents},
    )
    return span_maker
|
|
|
|
|
|
def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
    """
    Forward pass of the span maker: for every entity in every doc, compute the
    token window that covers the entity's sentence plus up to n_sents
    sentences before and after it.
    model (Model): The span maker layer; reads model.ops and model.attrs["n_sents"].
    docs (List[Doc]): Docs whose entities (doc.ents) are turned into windows.
        If a doc has no sentence information (doc.sents raises ValueError), its
        tokens are modified so that only the first token starts a sentence and
        the whole doc is treated as one sentence.
    is_train: Not used by this function.
    RETURNS (Tuple[Ragged, Callable]): A Ragged of (start_token, end_token)
        pairs with one length per doc, and a backprop callback that returns
        an empty list.
    RAISES (RuntimeError): Errors.E030 if looking up an entity's sentence
        raises AttributeError.
    """
    ops = model.ops
    window = model.attrs["n_sents"]
    spans_per_doc = []
    for doc in docs:
        try:
            doc_sents = list(doc.sents)
        except ValueError:
            # no sentence info, normal in initialization
            for token in doc:
                token.is_sent_start = token.i == 0
            doc_sents = [doc[:]]

        doc_spans = []
        for ent in doc.ents:
            try:
                # locate the entity's sentence within the doc's sentences
                ent_sent_i = doc_sents.index(ent.sent)
            except AttributeError:
                # ent.sent is None: surface a user-friendly error instead
                raise RuntimeError(Errors.E030) from None
            # clamp the sentence window to the sentences that actually exist
            first_sent_i = max(0, ent_sent_i - window)
            last_sent_i = min(len(doc_sents) - 1, ent_sent_i + window)
            # token offsets of the window, saved for extraction
            window_start = doc_sents[first_sent_i].start
            window_end = doc_sents[last_sent_i].end
            doc_spans.append((window_start, window_end))

        spans_per_doc.append(ops.asarray2i(doc_spans))

    lengths = ops.asarray1i([len(doc_spans) for doc_spans in spans_per_doc])
    spans = Ragged(ops.flatten(spans_per_doc), lengths)

    def backprop(d_spans):
        # because this is just rearranging docs, the backprop does nothing
        return []

    return spans, backprop
|
|
|
|
|
|
@registry.misc("spacy.KBFromFile.v1")
def load_kb(
    kb_path: Path,
) -> Callable[[Vocab], KnowledgeBase]:
    """
    Create a factory that builds an InMemoryLookupKB and loads it from disk.
    kb_path (Path): Location handed to the KB's from_disk().
    RETURNS (Callable[[Vocab], KnowledgeBase]): Function that takes a vocab and
        returns the loaded knowledge base.
    """

    def kb_from_file(vocab: Vocab):
        # NOTE(review): entity_vector_length=1 looks like a placeholder that
        # from_disk() is expected to overwrite - confirm against the KB class.
        loaded_kb = InMemoryLookupKB(vocab, entity_vector_length=1)
        loaded_kb.from_disk(kb_path)
        return loaded_kb

    return kb_from_file
|
|
|
|
|
|
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
    """
    Create a factory for empty in-memory knowledge bases.
    RETURNS (Callable[[Vocab, int], KnowledgeBase]): Function that takes a vocab
        and an entity vector length and returns a new InMemoryLookupKB.
    """

    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
        fresh_kb = InMemoryLookupKB(
            vocab=vocab,
            entity_vector_length=entity_vector_length,
        )
        return fresh_kb

    return empty_kb_factory
|
|
|
|
|
|
@registry.misc("spacy.EmptyKB.v1")
def empty_kb(
    entity_vector_length: int,
) -> Callable[[Vocab], KnowledgeBase]:
    """
    Create a factory for empty in-memory knowledge bases with a fixed entity
    vector length.
    entity_vector_length (int): Entity vector length given to every KB the
        factory creates.
    RETURNS (Callable[[Vocab], KnowledgeBase]): Function that takes a vocab and
        returns a new InMemoryLookupKB.
    """

    def empty_kb_factory(vocab: Vocab):
        fresh_kb = InMemoryLookupKB(
            vocab=vocab,
            entity_vector_length=entity_vector_length,
        )
        return fresh_kb

    return empty_kb_factory
|
|
|
|
|
|
@registry.misc("spacy.CandidateGenerator.v1")
def create_get_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
    """
    Provide the single-mention candidate generator.
    RETURNS (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): The
        get_candidates function.
    """
    candidate_generator = get_candidates
    return candidate_generator
|
|
|
|
|
|
@registry.misc("spacy.CandidateGenerator.v2")
def create_get_candidates_v2() -> Callable[
    [KnowledgeBase, Iterator[SpanGroup]], Iterator[CandidatesForDocT]
]:
    """
    Provide the batched (per-doc) candidate generator.
    RETURNS (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[CandidatesForDocT]]):
        The get_candidates_v2 function.
    """
    candidate_generator = get_candidates_v2
    return candidate_generator
|
|
|
|
|
|
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Return candidate entities for the given mention from the KB.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention.
    RETURNS (Iterable[Candidate]): Identified candidates for specified mention.
        Empty if the KB yields no result for the mention.
    """
    # Wrap the single mention as a one-doc, one-mention batch so the batched
    # lookup can be reused.
    mentions_per_doc = iter([SpanGroup(mention.doc, spans=[mention])])
    # Outer level of the result: one entry per doc (exactly one doc was passed).
    cands_per_doc = next(get_candidates_v2(kb, mentions_per_doc), ())
    # Inner level: one entry per mention. Return that entry itself - it is the
    # Iterable[Candidate] for the mention. Calling next() on it would instead
    # hand back a single Candidate (or fail if the entry is a list). Using
    # iter() rather than indexing also accepts any iterable, not only lists.
    return next(iter(cands_per_doc), [])
|
|
|
|
|
|
def get_candidates_v2(
    kb: KnowledgeBase, mentions: Iterator[SpanGroup]
) -> Iterator[Iterable[Iterable[Candidate]]]:
    """
    Look up candidate entities for batches of mentions by delegating to the
    KB's own get_candidates().
    kb (KnowledgeBase): Knowledge base to query.
    mentions (Iterator[SpanGroup]): Mentions per doc.
    RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per
        mention, grouped per document/SpanGroup.
    """
    candidates_per_doc = kb.get_candidates(mentions)
    return candidates_per_doc
|