Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 02:06:31 +03:00
Commit dca2e8c644:

* Fix TODO about typing. The fix was simple: just request an array2f.
* Add type ignore. Maxout has a more restrictive type than the residual layer expects (only Floats2d vs any Floats).
* Various cleanup. This moves a lot of lines around but doesn't change any functionality. Details:
  1. use `continue` to reduce indentation
  2. move sentence doc building inside the conditional since it's otherwise unused
  3. reduce some temporary assignments
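A minimal sketch of the typing issue described above (the widths are assumed, not taken from the file): Maxout declares the 2d-specific Floats2d as its output type, while residual() is typed for a broader family of float arrays, so the composition used in build_nel_encoder below only passes mypy with the ignore.

from thinc.api import Maxout, residual

# same call pattern as in build_nel_encoder; without the ignore, a type checker
# flags the Floats2d-vs-Floats mismatch the commit message describes
block = residual(Maxout(nO=96, nI=96, nP=2, dropout=0.0))  # type: ignore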
101 lines · 3.6 KiB · Python
from pathlib import Path
from typing import Optional, Callable, Iterable, List, Tuple
from thinc.types import Floats2d
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged

from ...util import registry
from ...kb import KnowledgeBase, Candidate, get_candidates
from ...vocab import Vocab
from ...tokens import Span, Doc
from ..extract_spans import extract_spans
from ...errors import Errors


@registry.architectures("spacy.EntityLinker.v2")
def build_nel_encoder(
    tok2vec: Model, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]:
    with Model.define_operators({">>": chain, "&": tuplify}):
        token_width = tok2vec.maybe_get_dim("nO")
        output_layer = Linear(nO=nO, nI=token_width)
        model = (
            ((tok2vec >> list2ragged()) & build_span_maker())
            >> extract_spans()
            >> reduce_mean()
            >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))  # type: ignore
            >> output_layer
        )
        model.set_ref("output_layer", output_layer)
        model.set_ref("tok2vec", tok2vec)
        # flag to show this isn't legacy
        model.attrs["include_span_maker"] = True
    return model


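# Illustrative sketch, not part of the original module: "spacy.EntityLinker.v2"
# is normally referenced from the entity_linker component's model config rather
# than called directly. The helper below resolves a minimal config through the
# registry, roughly what spaCy's config system does at training time; the
# HashEmbedCNN settings and nO value are assumed example values.
def _example_resolve_nel_config() -> Model:
    cfg = {
        "model": {
            "@architectures": "spacy.EntityLinker.v2",
            "nO": 64,
            "tok2vec": {
                "@architectures": "spacy.HashEmbedCNN.v2",
                "pretrained_vectors": None,
                "width": 96,
                "depth": 2,
                "embed_size": 2000,
                "window_size": 1,
                "maxout_pieces": 3,
                "subword_features": True,
            },
        }
    }
    # resolve the nested promises bottom-up: the tok2vec block becomes a Model
    # that is passed to build_nel_encoder
    return registry.resolve(cfg)["model"]  # Model[List[Doc], Floats2d]

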
def build_span_maker(n_sents: int = 0) -> Model:
    model: Model = Model("span_maker", forward=span_maker_forward)
    model.attrs["n_sents"] = n_sents
    return model


def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
    ops = model.ops
    n_sents = model.attrs["n_sents"]
    candidates = []
    for doc in docs:
        cands = []
        try:
            sentences = [s for s in doc.sents]
        except ValueError:
            # no sentence info, normal in initialization
            for tok in doc:
                tok.is_sent_start = tok.i == 0
            sentences = [doc[:]]
        for ent in doc.ents:
            try:
                # find the sentence in the list of sentences.
                sent_index = sentences.index(ent.sent)
            except AttributeError:
                # Catch the exception when ent.sent is None and provide a user-friendly warning
                raise RuntimeError(Errors.E030) from None
            # get n previous sentences, if there are any
            start_sentence = max(0, sent_index - n_sents)
            # get n posterior sentences, or as many < n as there are
            end_sentence = min(len(sentences) - 1, sent_index + n_sents)
            # get token positions
            start_token = sentences[start_sentence].start
            end_token = sentences[end_sentence].end
            # save positions for extraction
            cands.append((start_token, end_token))

        candidates.append(ops.asarray2i(cands))
    candlens = ops.asarray1i([len(cands) for cands in candidates])
    candidates = ops.xp.concatenate(candidates)
    outputs = Ragged(candidates, candlens)
    # because this is just rearranging docs, the backprop does nothing
    return outputs, lambda x: []


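# Illustrative sketch, not part of the original module: a toy run of the span
# maker on a blank English pipeline, assuming manually annotated sentence starts
# and entities. With n_sents=0 each entity contributes its own sentence's token
# span, packed into a Ragged with one length entry per doc.
def _example_span_maker() -> Ragged:
    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Alice met Bob. They talked.")
    doc[4].is_sent_start = True  # second sentence starts at "They"
    doc.ents = [Span(doc, 0, 1, label="PERSON"), Span(doc, 2, 3, label="PERSON")]

    span_maker = build_span_maker(n_sents=0)
    ragged = span_maker.predict([doc])
    # ragged.lengths -> [2]; both entities are in the first sentence, so both
    # rows hold that sentence's (start, end) token offsets, i.e. (0, 4)
    return ragged

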
@registry.misc("spacy.KBFromFile.v1")
def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
    def kb_from_file(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb.from_disk(kb_path)
        return kb

    return kb_from_file


@registry.misc("spacy.EmptyKB.v1")
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
    def empty_kb_factory(vocab):
        return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)

    return empty_kb_factory


@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
    return get_candidates


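# Illustrative sketch, not part of the original module: the misc functions above
# are normally referenced from the entity_linker component's config (as its KB
# loader and candidate generator). The helper below builds a small in-memory KB
# through "spacy.EmptyKB.v1" and queries it through "spacy.CandidateGenerator.v1";
# the entity ID, frequency, vector and alias are assumed example values.
def _example_candidate_generation() -> List[Candidate]:
    import spacy

    nlp = spacy.blank("en")
    kb = registry.misc.get("spacy.EmptyKB.v1")(entity_vector_length=3)(nlp.vocab)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[0.1, 0.2, 0.3])
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

    get_cands = registry.misc.get("spacy.CandidateGenerator.v1")()
    doc = nlp("Douglas Adams wrote books.")
    return list(get_cands(kb, doc[0:2]))  # one Candidate for "Q42"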