spaCy/spacy/pipeline/entity_linker.py

import random
import warnings
from itertools import islice, tee
from pathlib import Path
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Union,
    cast,
)

import srsly
from numpy import dtype
from thinc.api import Config, CosineDistance, Model, Optimizer, set_dropout_rate
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged

from .. import util
from ..errors import Errors, Warnings
from ..kb import Candidate, KnowledgeBase
from ..language import Language
from ..scorer import Scorer
from ..tokens import Doc, Span, SpanGroup
from ..training import Example, validate_examples, validate_get_examples
from ..util import SimpleFrozenList, registry
from ..vocab import Vocab
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe

# Type of the dictionary returned by EntityLinker.predict: maps an activation
# name to either a list of Ragged arrays or a list of strings.
ActivationsT = Dict[str, Union[List[Ragged], List[str]]]

# Key under which the predicted KB identifiers are stored in the activations
# dictionary.
KNOWLEDGE_BASE_IDS = "kb_ids"

# Config text describing the default model; it is parsed below and its
# "model" section is used as the default "model" setting of the
# "entity_linker" factory.
default_model_config = """
[model]
@architectures = "spacy.EntityLinker.v2"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# Parsed "model" section of the config text above.
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "entity_linker",
    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
    assigns=["token.ent_kb_id"],
    default_config={
        "model": DEFAULT_NEL_MODEL,
        "labels_discard": [],
        "n_sents": 0,
        "incl_prior": True,
        "incl_context": True,
        "entity_vector_length": 64,
        "get_candidates": {"@misc": "spacy.CandidateGenerator.v2"},
        "overwrite": False,
        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
        "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
        "use_gold_ents": True,
        "threshold": None,
        "save_activations": False,
    },
    default_score_weights={
        "nel_micro_f": 1.0,
        "nel_micro_r": None,
        "nel_micro_p": None,
    },
)
def make_entity_linker(
    nlp: Language,
    name: str,
    model: Model,
    *,
    labels_discard: Iterable[str],
    n_sents: int,
    incl_prior: bool,
    incl_context: bool,
    entity_vector_length: int,
    get_candidates: Callable[
        [KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]
    ],
    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
    overwrite: bool,
    scorer: Optional[Callable],
    use_gold_ents: bool,
    threshold: Optional[float] = None,
    save_activations: bool,
):
    """Factory for the "entity_linker" component: build an EntityLinker.

    nlp (Language): The pipeline the component is created for. Its vocab is
        handed to the component.
    name (str): The component instance name.
    model (Model[List[Doc], Floats2d]): A model that learns document vector
        representations. Given a batch of Doc objects, it should return a single
        array, with one row per item in the batch. Its `attrs` must contain a
        truthy "include_span_maker" entry.
    labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
    n_sents (int): The number of neighbouring sentences to take into account.
    incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
    incl_context (bool): Whether or not to include the local context in the model.
    entity_vector_length (int): Size of encoding vectors in the KB.
    get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
        Function producing a list of candidates per document, given a certain knowledge base and several textual
        documents with textual mentions.
    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
    overwrite (bool): Whether to overwrite existing non-empty annotations.
    scorer (Optional[Callable]): The scoring method.
    use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
        component must provide entity annotations.
    threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
        prediction is discarded. If None, predictions are not filtered by any threshold.
    save_activations (bool): save model activations in Doc when annotating.
    RETURNS (EntityLinker): The constructed component.
    RAISES (ValueError): With Errors.E4005 if the model's attrs do not contain
        a truthy "include_span_maker" entry.
    """
    includes_span_maker = bool(model.attrs.get("include_span_maker", False))
    if not includes_span_maker:
        raise ValueError(Errors.E4005)

    # Collect the keyword-only settings first, then construct the component.
    linker_settings = dict(
        labels_discard=labels_discard,
        n_sents=n_sents,
        incl_prior=incl_prior,
        incl_context=incl_context,
        entity_vector_length=entity_vector_length,
        get_candidates=get_candidates,
        generate_empty_kb=generate_empty_kb,
        overwrite=overwrite,
        scorer=scorer,
        use_gold_ents=use_gold_ents,
        threshold=threshold,
        save_activations=save_activations,
    )
    return EntityLinker(nlp.vocab, model, name, **linker_settings)


def entity_linker_score(examples, **kwargs):
    """Score entity links with Scorer.score_links.

    The "NIL" label of the EntityLinker is passed as the only negative label;
    any extra keyword arguments are forwarded unchanged.

    examples (Iterable[Example]): The examples to score.
    RETURNS: Whatever Scorer.score_links returns for these examples.
    """
    nil_labels = [EntityLinker.NIL]
    return Scorer.score_links(examples, negative_labels=nil_labels, **kwargs)


@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer():
    """Registered scorer factory: return the entity_linker_score function."""
    scorer = entity_linker_score
    return scorer


class EntityLinker(TrainablePipe):
    """Pipeline component for named entity linking.

    For every entity in `doc.ents` the component picks an identifier from its
    knowledge base (or "NIL") and writes it to `token.ent_kb_id_` of the
    entity's tokens.

    DOCS: https://spacy.io/api/entitylinker
    """

    NIL = "NIL"  # string used to refer to a non-existing link

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "entity_linker",
        *,
        labels_discard: Iterable[str],
        n_sents: int,
        incl_prior: bool,
        incl_context: bool,
        entity_vector_length: int,
        get_candidates: Callable[
            [KnowledgeBase, Iterator[SpanGroup]],
            Iterator[Iterable[Iterable[Candidate]]],
        ],
        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
        overwrite: bool = False,
        scorer: Optional[Callable] = entity_linker_score,
        use_gold_ents: bool,
        threshold: Optional[float] = None,
        save_activations: bool = False,
    ) -> None:
        """Initialize an entity linker.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
        n_sents (int): The number of neighbouring sentences to take into account.
        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
        incl_context (bool): Whether or not to include the local context in the model.
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
            Function producing a list of candidates per document, given a certain knowledge base and several textual
            documents with textual mentions.
        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
        overwrite (bool): Whether to overwrite existing non-empty annotations.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
            component must provide entity annotations.
        threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
            threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
        save_activations (bool): save model activations in Doc when annotating.
        RAISES (ValueError): If threshold is given but lies outside [0, 1].

        DOCS: https://spacy.io/api/entitylinker#init
        """

        if threshold is not None and not (0 <= threshold <= 1):
            raise ValueError(
                Errors.E1043.format(
                    range_start=0,
                    range_end=1,
                    value=threshold,
                )
            )

        self.vocab = vocab
        self.model = model
        self.name = name
        self.labels_discard = list(labels_discard)
        # how many neighbour sentences to take into account
        self.n_sents = n_sents
        self.incl_prior = incl_prior
        self.incl_context = incl_context
        self.get_candidates = get_candidates
        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
        self.distance = CosineDistance(normalize=False)
        # Start out with an empty KB; it is replaced via set_kb()/initialize()
        # or filled when deserializing.
        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
        self.use_gold_ents = use_gold_ents
        self.threshold = threshold
        self.save_activations = save_activations

        if self.incl_prior and not self.kb.supports_prior_probs:
            warnings.warn(Warnings.W401)

        def _score_with_ents_set(examples: Iterable[Example], **kwargs):
            # Because of how spaCy works, we can't just score immediately, because Language.evaluate
            # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
            if not scorer:
                return scorer
            if self.use_gold_ents:
                # Copy the gold entities onto the predicted docs and re-run
                # this component, so there is something to link and score.
                examples = self._ensure_ents(examples)
                docs = self.pipe(
                    (eg.predicted for eg in examples),
                )
                for eg, doc in zip(examples, docs):
                    eg.predicted = doc
            return scorer(examples, **kwargs)

        self.scorer = _score_with_ents_set

    def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
        """If use_gold_ents is true, return copies of the examples whose
        predicted docs carry the aligned gold entities. Otherwise the examples
        are returned unchanged.

        examples (Iterable[Example]): The examples to process.
        RETURNS (Iterable[Example]): The input itself, or a list of copies.
        """
        if not self.use_gold_ents:
            return examples

        new_examples = []
        for eg in examples:
            ents, _ = eg.get_aligned_ents_and_ner()
            new_eg = eg.copy()
            new_eg.predicted.ents = ents
            new_examples.append(new_eg)
        return new_examples

    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
        """Define the KB of this pipe by providing a function that will
        create it using this object's vocab.

        kb_loader (Callable[[Vocab], KnowledgeBase]): Function creating the KB.
        RAISES (ValueError): If kb_loader is not callable.
        """
        if not callable(kb_loader):
            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))

        self.kb = kb_loader(self.vocab)  # type: ignore

    def validate_kb(self) -> None:
        """Check that the knowledge base can be used.

        RAISES (ValueError): If the KB is None, or if it offers an `is_empty`
            method that reports it to be empty.
        """
        # Raise an error if the knowledge base is not initialized.
        if self.kb is None:
            raise ValueError(Errors.E1018.format(name=self.name))
        if hasattr(self.kb, "is_empty") and self.kb.is_empty():
            raise ValueError(Errors.E139.format(name=self.name))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
            instance. Note that providing this argument will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.

        DOCS: https://spacy.io/api/entitylinker#initialize
        """
        validate_get_examples(get_examples, "EntityLinker.initialize")
        if kb_loader is not None:
            self.set_kb(kb_loader)
        self.validate_kb()
        nO = self.kb.entity_vector_length
        doc_sample = []
        vector_sample = []
        # Only the first 10 examples are used as the initialization sample.
        examples = self._ensure_ents(islice(get_examples(), 10))
        for eg in examples:
            doc = eg.x
            doc_sample.append(doc)
            vector_sample.append(self.model.ops.alloc1f(nO))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)

        # XXX In order for size estimation to work, there has to be at least
        # one entity. It's not used for training so it doesn't have to be real,
        # so we add a fake one if none are present.
        # We can't use Doc.has_annotation here because it can be True for docs
        # that have been through an NER component but got no entities.
        has_annotations = any(doc.ents for doc in doc_sample)
        if not has_annotations:
            doc = doc_sample[0]
            ent = doc[0:1]
            ent.label_ = "XXX"
            doc.ents = (ent,)

        self.model.initialize(
            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
        )

        if not has_annotations:
            # Clean up dummy annotation
            doc.ents = []

    def batch_has_learnable_example(self, examples):
        """Check if a batch contains a learnable example.

        If one isn't present, then the update step needs to be skipped.

        examples (Iterable[Example]): The batch to inspect.
        RETURNS (bool): True if the candidate generator returns at least one
            candidate for at least one entity of a predicted doc.
        """
        for candidates_for_doc in self.get_candidates(
            self.kb,
            (SpanGroup(doc=eg.predicted, spans=eg.predicted.ents) for eg in examples),
        ):
            for candidates_for_mention in candidates_for_doc:
                if list(candidates_for_mention):
                    return True

        return False

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/entitylinker#update
        """
        self.validate_kb()
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        if not examples:
            return losses
        examples = self._ensure_ents(examples)
        validate_examples(examples, "EntityLinker.update")

        # make sure we have something to learn from, if not, short-circuit
        if not self.batch_has_learnable_example(examples):
            return losses

        set_dropout_rate(self.model, drop)
        docs = [eg.predicted for eg in examples]
        sentence_encodings, bp_context = self.model.begin_update(docs)

        loss, d_scores = self.get_loss(
            sentence_encodings=sentence_encodings, examples=examples
        )
        bp_context(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss

        return losses

    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        """Compute the loss and its gradient for a batch of examples.

        The encodings of entities that have a gold KB identifier are compared
        to the KB vectors of those identifiers using the cosine distance.

        examples (Iterable[Example]): The batch of examples.
        sentence_encodings (Floats2d): The model output. Rows are looked up by
            the running position of each matching entity across the batch.
        RETURNS (Tuple[float, Floats2d]): The loss, averaged over the entities
            with a gold KB identifier, and a gradient array with the same
            shape as sentence_encodings (zero for rows without a gold
            identifier).
        RAISES (RuntimeError): If the selected encodings and the gold entity
            vectors differ in shape.
        """
        validate_examples(examples, "EntityLinker.get_loss")
        gold_vectors = []
        # We assume that get_loss is called with gold ents set in the examples if need be
        eidx = 0  # indices in gold entities to keep
        keep_ents = []  # indices in sentence_encodings to keep

        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)

            for ent in eg.get_matching_ents():
                kb_id = kb_ids[ent.start]
                if kb_id:
                    gold_vectors.append(self.kb.get_vector(kb_id))
                    keep_ents.append(eidx)

                eidx += 1

        # to match the input size, we need to give a zero gradient for items not in the kb
        out = self.model.ops.alloc2f(*sentence_encodings.shape)

        # if there are no matches, short circuit before touching any arrays
        if not keep_ents:
            return 0.0, out

        entity_encodings = self.model.ops.asarray2f(gold_vectors, dtype="float32")
        selected_encodings = sentence_encodings[keep_ents]

        if selected_encodings.shape != entity_encodings.shape:
            err = Errors.E147.format(
                method="get_loss", msg="gold entities do not match up"
            )
            raise RuntimeError(err)
        gradients = self.distance.get_grad(selected_encodings, entity_encodings)
        out[keep_ents] = gradients

        loss = self.distance.get_loss(selected_encodings, entity_encodings)
        loss = loss / len(entity_encodings)
        return float(loss), out

    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
        no prediction.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS (ActivationsT): A dictionary with the predicted KB identifiers
            (one per entity, in document order) under "kb_ids", plus the
            candidate entities and their scores under "ents" and "scores".
        RAISES (RuntimeError): If the number of predictions differs from the
            number of entities, or if candidate vectors and prior
            probabilities differ in length.

        DOCS: https://spacy.io/api/entitylinker#predict
        """
        self.validate_kb()
        entity_count = 0
        final_kb_ids: List[str] = []
        ops = self.model.ops
        xp = ops.xp
        docs_ents: List[Ragged] = []
        docs_scores: List[Ragged] = []
        if not docs:
            return {
                KNOWLEDGE_BASE_IDS: final_kb_ids,
                "ents": docs_ents,
                "scores": docs_scores,
            }
        if isinstance(docs, Doc):
            docs = [docs]

        # The docs are traversed twice: once lazily by the candidate generator
        # and once by the prediction loop below.
        docs_iters = tee(docs, 2)

        # Call candidate generator. Docs without entities are left out here
        # because the loop below does not fetch candidates for them; passing
        # them on would shift the candidates of all following docs.
        all_ent_cands = self.get_candidates(
            self.kb,
            (
                SpanGroup(
                    doc,
                    spans=[
                        ent for ent in doc.ents if ent.label_ not in self.labels_discard
                    ],
                )
                for doc in docs_iters[0]
                if len(doc.ents) > 0
            ),
        )

        for doc in docs_iters[1]:
            doc_ents: List[Ints1d] = []
            doc_scores: List[Floats1d] = []
            if len(doc) == 0 or len(doc.ents) == 0:
                docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
                docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
                continue
            sentences = [s for s in doc.sents]
            doc_ent_cands = list(next(all_ent_cands))
            # Candidates were only requested for mentions whose label is not
            # discarded, so they are indexed by the position among those
            # mentions rather than by the position in doc.ents.
            cand_idx = 0

            # Looping over candidate entities for this doc. (TODO: rewrite)
            for ent in doc.ents:
                assert hasattr(ent, "sents")
                sents = list(ent.sents)
                sent_indices = (
                    sentences.index(sents[0]),
                    sentences.index(sents[-1]),
                )
                assert sent_indices[1] >= sent_indices[0] >= 0

                if self.incl_context:
                    # get n_neighbour sentences, clipped to the length of the document
                    start_sentence = max(0, sent_indices[0] - self.n_sents)
                    end_sentence = min(
                        len(sentences) - 1, sent_indices[1] + self.n_sents
                    )
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    sent_doc = doc[start_token:end_token].as_doc()
                    # currently, the context is the same for each entity in a sentence (should be refined)
                    sentence_encoding = self.model.predict([sent_doc])[0]
                    sentence_encoding_t = sentence_encoding.T
                    sentence_norm = xp.linalg.norm(sentence_encoding_t)
                entity_count += 1
                if ent.label_ in self.labels_discard:
                    # ignoring this entity - setting to NIL
                    final_kb_ids.append(self.NIL)
                    self._add_activations(
                        doc_scores=doc_scores,
                        doc_ents=doc_ents,
                        scores=[0.0],
                        ents=[0],
                    )
                else:
                    candidates = list(doc_ent_cands[cand_idx])
                    cand_idx += 1
                    if not candidates:
                        # no prediction possible for this entity - setting to NIL
                        final_kb_ids.append(self.NIL)
                        self._add_activations(
                            doc_scores=doc_scores,
                            doc_ents=doc_ents,
                            scores=[0.0],
                            ents=[0],
                        )
                    elif len(candidates) == 1 and self.threshold is None:
                        # shortcut for efficiency reasons: take the 1 candidate
                        final_kb_ids.append(candidates[0].entity_id_)
                        self._add_activations(
                            doc_scores=doc_scores,
                            doc_ents=doc_ents,
                            scores=[1.0],
                            ents=[candidates[0].entity_id],
                        )
                    else:
                        random.shuffle(candidates)
                        # set all prior probabilities to 0 if incl_prior=False
                        scores = prior_probs = xp.asarray(
                            [
                                c.prior_prob if self.incl_prior else 0.0
                                for c in candidates
                            ]
                        )
                        # add in similarity from the context
                        if self.incl_context:
                            entity_encodings = xp.asarray(
                                [c.entity_vector for c in candidates]
                            )
                            entity_norm = xp.linalg.norm(entity_encodings, axis=1)
                            if len(entity_encodings) != len(prior_probs):
                                raise RuntimeError(
                                    Errors.E147.format(
                                        method="predict",
                                        msg="vectors not of equal length",
                                    )
                                )
                            # cosine similarity
                            sims = xp.dot(entity_encodings, sentence_encoding_t) / (
                                sentence_norm * entity_norm
                            )
                            if sims.shape != prior_probs.shape:
                                raise ValueError(Errors.E161)
                            # combine prior and similarity: p + s - p * s
                            scores = prior_probs + sims - (prior_probs * sims)
                        final_kb_ids.append(
                            candidates[scores.argmax().item()].entity_id_
                            if self.threshold is None or scores.max() >= self.threshold
                            else EntityLinker.NIL
                        )
                        self._add_activations(
                            doc_scores=doc_scores,
                            doc_ents=doc_ents,
                            scores=scores,
                            ents=[c.entity_id for c in candidates],
                        )

            self._add_doc_activations(
                docs_scores=docs_scores,
                docs_ents=docs_ents,
                doc_scores=doc_scores,
                doc_ents=doc_ents,
            )
        if not (len(final_kb_ids) == entity_count):
            err = Errors.E147.format(
                method="predict", msg="result variables not of equal length"
            )
            raise RuntimeError(err)

        return {
            KNOWLEDGE_BASE_IDS: final_kb_ids,
            "ents": docs_ents,
            "scores": docs_scores,
        }

    def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        activations (ActivationsT): The activations used for setting annotations, produced
                                 by EntityLinker.predict.
        RAISES (ValueError): If the number of entities in the docs differs
            from the number of predicted KB identifiers.

        DOCS: https://spacy.io/api/entitylinker#set_annotations
        """
        if isinstance(docs, Doc):
            docs = [docs]
        # The docs are traversed twice (counting, then annotating), so a
        # one-shot iterable has to be materialized first.
        docs = list(docs)
        kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
        count_ents = sum(len(doc.ents) for doc in docs)
        if count_ents != len(kb_ids):
            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
        i = 0
        overwrite = self.cfg["overwrite"]
        for j, doc in enumerate(docs):
            if self.save_activations:
                doc.activations[self.name] = {}
                for act_name, acts in activations.items():
                    if act_name != KNOWLEDGE_BASE_IDS:
                        # We only copy activations that are Ragged.
                        doc.activations[self.name][act_name] = cast(Ragged, acts[j])

            for ent in doc.ents:
                kb_id = kb_ids[i]
                i += 1
                for token in ent:
                    # Existing identifiers are only replaced if requested.
                    if token.ent_kb_id == 0 or overwrite:
                        token.ent_kb_id_ = kb_id

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the pipe to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/entitylinker#to_bytes
        """
        self._validate_serialization_attrs()
        serialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
        serialize["kb"] = self.kb.to_bytes
        serialize["model"] = self.model.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load the pipe from a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (TrainablePipe): The loaded object.
        RAISES (ValueError): With Errors.E149 if loading the model raises an
            AttributeError.

        DOCS: https://spacy.io/api/entitylinker#from_bytes
        """
        self._validate_serialization_attrs()

        def load_model(b):
            try:
                self.model.from_bytes(b)
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
        deserialize["model"] = load_model
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Serialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/entitylinker#to_disk
        """
        serialize = {}
        serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["kb"] = lambda p: self.kb.to_disk(p)
        serialize["model"] = lambda p: self.model.to_disk(p)
        util.to_disk(path, serialize, exclude)

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityLinker":
        """Load the pipe from disk. Modifies the object in place and returns it.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (EntityLinker): The modified EntityLinker object.
        RAISES (ValueError): With Errors.E149 if loading the model raises an
            AttributeError.

        DOCS: https://spacy.io/api/entitylinker#from_disk
        """

        def load_model(p):
            try:
                with p.open("rb") as infile:
                    self.model.from_bytes(infile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize: Dict[str, Callable[[Any], Any]] = {}
        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
        deserialize["kb"] = lambda p: self.kb.from_disk(p)
        deserialize["model"] = load_model
        util.from_disk(path, deserialize, exclude)
        return self

    def rehearse(self, examples, *, sgd=None, losses=None, **config):
        """Not implemented for the entity linker."""
        raise NotImplementedError

    def add_label(self, label):
        """Not implemented for the entity linker."""
        raise NotImplementedError

    def _add_doc_activations(
        self,
        *,
        docs_scores: List[Ragged],
        docs_ents: List[Ragged],
        doc_scores: List[Floats1d],
        doc_ents: List[Ints1d],
    ):
        """Append the activations collected for one doc to the batch lists.

        Does nothing unless save_activations is set.

        docs_scores (List[Ragged]): Per-doc scores of the batch; appended to.
        docs_ents (List[Ragged]): Per-doc candidate entities of the batch;
            appended to.
        doc_scores (List[Floats1d]): Scores per entity of the current doc.
        doc_ents (List[Ints1d]): Candidate entities per entity of the current
            doc.
        """
        if not self.save_activations:
            return
        ops = self.model.ops
        # One length per entity: the number of candidates that were scored.
        lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
        docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
        docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))

    def _add_activations(
        self,
        *,
        doc_scores: List[Floats1d],
        doc_ents: List[Ints1d],
        scores: Sequence[float],
        ents: Sequence[int],
    ):
        """Record the candidate scores and entities of a single mention.

        Does nothing unless save_activations is set.

        doc_scores (List[Floats1d]): Scores of the current doc; appended to.
        doc_ents (List[Ints1d]): Candidate entities of the current doc;
            appended to.
        scores (Sequence[float]): The scores of the mention's candidates.
        ents (Sequence[int]): The entity values of the mention's candidates.
        """
        if not self.save_activations:
            return
        ops = self.model.ops
        doc_scores.append(ops.asarray1f(scores))
        doc_ents.append(ops.asarray1i(ents, dtype="uint64"))