Annotate TrainablePipe subclasses with NVTX ranges

2025-08-13 16:44:56 +03:00 · 2022-06-14 15:29:40 +02:00 · 2022-06-14 15:29:40 +02:00 · d8684f7372
commit d8684f7372
parent 26536eb6b8
9 changed files with 76 additions and 9 deletions
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -12,7 +12,7 @@ from thinc.types import Floats2d, Ints1d, Ints2d
 from ._edit_tree_internals.edit_trees import EditTrees
 from ._edit_tree_internals.schemas import validate_edit_tree
 from .lemmatizer import lemmatizer_score
-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ..errors import Errors
 from ..language import Language
 from ..tokens import Doc
@@ -117,6 +117,7 @@ class EditTreeLemmatizer(TrainablePipe):
        self.cfg: Dict[str, Any] = {"labels": []}
        self.scorer = scorer

+    @trainable_pipe_nvtx_range
    def get_loss(
        self, examples: Iterable[Example], scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
@@ -144,6 +145,7 @@ class EditTreeLemmatizer(TrainablePipe):

        return float(loss), d_scores

+    @trainable_pipe_nvtx_range
    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
        n_docs = len(list(docs))
        if not any(len(doc) for doc in docs):
@@ -186,6 +188,7 @@ class EditTreeLemmatizer(TrainablePipe):

        return guesses

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
        for i, doc in enumerate(docs):
            doc_tree_ids = batch_tree_ids[i]
@@ -224,6 +227,7 @@ class EditTreeLemmatizer(TrainablePipe):
            trees.append(tree)
        return dict(trees=trees, labels=tuple(self.cfg["labels"]))

+    @trainable_pipe_nvtx_range
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -12,7 +12,7 @@ from ..ml import empty_kb
 from ..tokens import Doc, Span
 from .pipe import deserialize_config
 from .legacy.entity_linker import EntityLinker_v1
-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
@@ -208,6 +208,7 @@ class EntityLinker(TrainablePipe):
        if len(self.kb) == 0:
            raise ValueError(Errors.E139.format(name=self.name))

+    @trainable_pipe_nvtx_range
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
@@ -278,6 +279,7 @@ class EntityLinker(TrainablePipe):

        return False

+    @trainable_pipe_nvtx_range
    def update(
        self,
        examples: Iterable[Example],
@@ -338,6 +340,7 @@ class EntityLinker(TrainablePipe):
        losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        validate_examples(examples, "EntityLinker.get_loss")
        entity_encodings = []
@@ -377,6 +380,7 @@ class EntityLinker(TrainablePipe):
        loss = loss / len(entity_encodings)
        return float(loss), out

+    @trainable_pipe_nvtx_range
    def predict(self, docs: Iterable[Doc]) -> List[str]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
@@ -466,6 +470,7 @@ class EntityLinker(TrainablePipe):
            raise RuntimeError(err)
        return final_kb_ids

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
        """Modify a batch of documents, using pre-computed scores.

@@ -573,8 +578,10 @@ class EntityLinker(TrainablePipe):
        util.from_disk(path, deserialize, exclude)
        return self

+    @trainable_pipe_nvtx_range
    def rehearse(self, examples, *, sgd=None, losses=None, **config):
        raise NotImplementedError

+    @trainable_pipe_nvtx_range
    def add_label(self, label):
        raise NotImplementedError
--- a/spacy/pipeline/legacy/entity_linker.py
+++ b/spacy/pipeline/legacy/entity_linker.py
@@ -15,7 +15,7 @@ from ...kb import KnowledgeBase, Candidate
 from ...ml import empty_kb
 from ...tokens import Doc, Span
 from ..pipe import deserialize_config
-from ..trainable_pipe import TrainablePipe
+from ..trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ...language import Language
 from ...vocab import Vocab
 from ...training import Example, validate_examples, validate_get_examples
@@ -103,6 +103,7 @@ class EntityLinker_v1(TrainablePipe):
        if len(self.kb) == 0:
            raise ValueError(Errors.E139.format(name=self.name))

+    @trainable_pipe_nvtx_range
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
@@ -138,6 +139,7 @@ class EntityLinker_v1(TrainablePipe):
            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
        )

+    @trainable_pipe_nvtx_range
    def update(
        self,
        examples: Iterable[Example],
@@ -203,6 +205,7 @@ class EntityLinker_v1(TrainablePipe):
        losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        validate_examples(examples, "EntityLinker_v1.get_loss")
        entity_encodings = []
@@ -224,6 +227,7 @@ class EntityLinker_v1(TrainablePipe):
        loss = loss / len(entity_encodings)
        return float(loss), gradients

+    @trainable_pipe_nvtx_range
    def predict(self, docs: Iterable[Doc]) -> List[str]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
@@ -312,6 +316,7 @@ class EntityLinker_v1(TrainablePipe):
            raise RuntimeError(err)
        return final_kb_ids

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
        """Modify a batch of documents, using pre-computed scores.

@@ -419,8 +424,10 @@ class EntityLinker_v1(TrainablePipe):
        util.from_disk(path, deserialize, exclude)
        return self

+    @trainable_pipe_nvtx_range
    def rehearse(self, examples, *, sgd=None, losses=None, **config):
        raise NotImplementedError

+    @trainable_pipe_nvtx_range
    def add_label(self, label):
        raise NotImplementedError
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -6,7 +6,7 @@ from thinc.api import set_dropout_rate

 from ..tokens.doc cimport Doc

-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from .tagger import Tagger
 from ..training import validate_examples
 from ..language import Language
@@ -78,9 +78,11 @@ class MultitaskObjective(Tagger):
    def labels(self, value):
        self.cfg["labels"] = value

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs, dep_ids):
        pass

+    @trainable_pipe_nvtx_range
    def initialize(self, get_examples, nlp=None, labels=None):
        if not hasattr(get_examples, "__call__"):
            err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
@@ -95,11 +97,13 @@ class MultitaskObjective(Tagger):
                        self.labels[label] = len(self.labels)
        self.model.initialize()   # TODO: fix initialization by defining X and Y

+    @trainable_pipe_nvtx_range
    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
        scores = self.model.get_ref("softmax")(tokvecs)
        return tokvecs, scores

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples, scores):
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
@@ -174,19 +178,23 @@ class ClozeMultitask(TrainablePipe):
        self.cfg = cfg
        self.distance = CosineDistance(ignore_zeros=True, normalize=False)  # TODO: in config

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs, dep_ids):
        pass

+    @trainable_pipe_nvtx_range
    def initialize(self, get_examples, nlp=None):
        self.model.initialize()  # TODO: fix initialization by defining X and Y
        X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
        self.model.output_layer.initialize(X)

+    @trainable_pipe_nvtx_range
    def predict(self, docs):
        tokvecs = self.model.get_ref("tok2vec")(docs)
        vectors = self.model.get_ref("output_layer")(tokvecs)
        return tokvecs, vectors

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples, vectors, prediction):
        validate_examples(examples, "ClozeMultitask.get_loss")
        # The simplest way to implement this would be to vstack the
@@ -199,9 +207,11 @@ class ClozeMultitask(TrainablePipe):
        loss = self.distance.get_loss(prediction, target)
        return float(loss), gradient

+    @trainable_pipe_nvtx_range
    def update(self, examples, *, drop=0., sgd=None, losses=None):
        pass

+    @trainable_pipe_nvtx_range
    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
@@ -217,5 +227,6 @@ class ClozeMultitask(TrainablePipe):
            losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def add_label(self, label):
        raise NotImplementedError
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -8,7 +8,7 @@ import numpy
 from ..compat import Protocol, runtime_checkable
 from ..scorer import Scorer
 from ..language import Language
-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ..tokens import Doc, SpanGroup, Span
 from ..vocab import Vocab
 from ..training import Example, validate_examples
@@ -227,6 +227,7 @@ class SpanCategorizer(TrainablePipe):
        """
        return str(self.cfg["spans_key"])

+    @trainable_pipe_nvtx_range
    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.

@@ -260,6 +261,7 @@ class SpanCategorizer(TrainablePipe):
        """
        return list(self.labels)

+    @trainable_pipe_nvtx_range
    def predict(self, docs: Iterable[Doc]):
        """Apply the pipeline's model to a batch of docs, without modifying them.

@@ -272,6 +274,7 @@ class SpanCategorizer(TrainablePipe):
        scores = self.model.predict((docs, indices))  # type: ignore
        return indices, scores

+    @trainable_pipe_nvtx_range
    def set_candidates(
        self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
    ) -> None:
@@ -290,6 +293,7 @@ class SpanCategorizer(TrainablePipe):
            for index in candidates.dataXd:
                doc.spans[candidates_key].append(doc[index[0] : index[1]])

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.

@@ -308,6 +312,7 @@ class SpanCategorizer(TrainablePipe):
            )
            offset += indices.lengths[i]

+    @trainable_pipe_nvtx_range
    def update(
        self,
        examples: Iterable[Example],
@@ -349,6 +354,7 @@ class SpanCategorizer(TrainablePipe):
        losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def get_loss(
        self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Floats2d]
    ) -> Tuple[float, float]:
@@ -399,6 +405,7 @@ class SpanCategorizer(TrainablePipe):
        loss = float((d_scores**2).sum())
        return loss, d_scores

+    @trainable_pipe_nvtx_range
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -11,7 +11,7 @@ from ..tokens.doc cimport Doc
 from ..morphology cimport Morphology
 from ..vocab cimport Vocab

-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from .pipe import deserialize_config
 from ..language import Language
 from ..attrs import POS, ID
@@ -126,6 +126,7 @@ class Tagger(TrainablePipe):
        """Data about the labels currently added to the component."""
        return tuple(self.cfg["labels"])

+    @trainable_pipe_nvtx_range
    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without modifying them.

@@ -155,6 +156,7 @@ class Tagger(TrainablePipe):
            guesses.append(doc_guesses)
        return guesses

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs, batch_tag_ids):
        """Modify a batch of documents, using pre-computed scores.

@@ -177,6 +179,7 @@ class Tagger(TrainablePipe):
                if doc.c[j].tag == 0 or overwrite:
                    doc.c[j].tag = self.vocab.strings[labels[tag_id]]

+    @trainable_pipe_nvtx_range
    def update(self, examples, *, drop=0., sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.
@@ -210,6 +213,7 @@ class Tagger(TrainablePipe):
        losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
        """Perform a "rehearsal" update from a batch of data. Rehearsal updates
        teach the current model to make predictions similar to an initial model,
@@ -245,6 +249,7 @@ class Tagger(TrainablePipe):
        losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
@@ -269,6 +274,7 @@ class Tagger(TrainablePipe):
            raise ValueError(Errors.E910.format(name=self.name))
        return float(loss), d_scores

+    @trainable_pipe_nvtx_range
    def initialize(self, get_examples, *, nlp=None, labels=None):
        """Initialize the pipe for training, using a representative set
        of data examples.
@@ -307,6 +313,7 @@ class Tagger(TrainablePipe):
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(X=doc_sample, Y=label_sample)

+    @trainable_pipe_nvtx_range
    def add_label(self, label):
        """Add a new label to the pipe.

--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -4,7 +4,7 @@ from thinc.types import Floats2d
 import numpy
 from itertools import islice

-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ..language import Language
 from ..training import Example, validate_examples, validate_get_examples
 from ..errors import Errors
@@ -181,6 +181,7 @@ class TextCategorizer(TrainablePipe):
        """
        return self.labels  # type: ignore[return-value]

+    @trainable_pipe_nvtx_range
    def predict(self, docs: Iterable[Doc]):
        """Apply the pipeline's model to a batch of docs, without modifying them.

@@ -199,6 +200,7 @@ class TextCategorizer(TrainablePipe):
        scores = self.model.ops.asarray(scores)
        return scores

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs: Iterable[Doc], scores) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.

@@ -211,6 +213,7 @@ class TextCategorizer(TrainablePipe):
            for j, label in enumerate(self.labels):
                doc.cats[label] = float(scores[i, j])

+    @trainable_pipe_nvtx_range
    def update(
        self,
        examples: Iterable[Example],
@@ -248,6 +251,7 @@ class TextCategorizer(TrainablePipe):
        losses[self.name] += loss
        return losses

+    @trainable_pipe_nvtx_range
    def rehearse(
        self,
        examples: Iterable[Example],
@@ -306,6 +310,7 @@ class TextCategorizer(TrainablePipe):
        truths = self.model.ops.asarray(truths)  # type: ignore
        return truths, not_missing  # type: ignore

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
@@ -325,6 +330,7 @@ class TextCategorizer(TrainablePipe):
        mean_square_error = (d_scores**2).mean()
        return float(mean_square_error), d_scores

+    @trainable_pipe_nvtx_range
    def add_label(self, label: str) -> int:
        """Add a new label to the pipe.

@@ -344,6 +350,7 @@ class TextCategorizer(TrainablePipe):
        self.vocab.strings.add(label)
        return 1

+    @trainable_pipe_nvtx_range
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -2,7 +2,7 @@ from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 from itertools import islice

-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ..training import Example, validate_examples, validate_get_examples
 from ..tokens import Doc
 from ..vocab import Vocab
@@ -109,6 +109,7 @@ class Tok2Vec(TrainablePipe):
                if isinstance(node, Tok2VecListener) and node.upstream_name in names:
                    self.add_listener(node, component.name)

+    @trainable_pipe_nvtx_range
    def predict(self, docs: Iterable[Doc]):
        """Apply the pipeline's model to a batch of docs, without modifying them.
        Returns a single tensor for a batch of documents.
@@ -128,6 +129,7 @@ class Tok2Vec(TrainablePipe):
            listener.receive(batch_id, tokvecs, _empty_backprop)
        return tokvecs

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
        """Modify a batch of documents, using pre-computed scores.

@@ -140,6 +142,7 @@ class Tok2Vec(TrainablePipe):
            assert tokvecs.shape[0] == len(doc)
            doc.tensor = tokvecs

+    @trainable_pipe_nvtx_range
    def update(
        self,
        examples: Iterable[Example],
@@ -194,9 +197,11 @@ class Tok2Vec(TrainablePipe):
            self.listeners[-1].receive(batch_id, tokvecs, backprop)
        return losses

+    @trainable_pipe_nvtx_range
    def get_loss(self, examples, scores) -> None:
        pass

+    @trainable_pipe_nvtx_range
    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -21,7 +21,7 @@ from ..ml.parser_model cimport predict_states, arg_max_if_valid
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
-from .trainable_pipe import TrainablePipe
+from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
 from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils

@@ -159,6 +159,7 @@ cdef class Parser(TrainablePipe):
    def incorrect_spans_key(self):
        return self.cfg["incorrect_spans_key"]

+    @trainable_pipe_nvtx_range
    def add_label(self, label):
        resized = False
        for action in self.moves.action_types:
@@ -214,6 +215,7 @@ cdef class Parser(TrainablePipe):
        with self.model.use_params(params):
            yield

+    @trainable_pipe_nvtx_range
    def pipe(self, docs, *, int batch_size=256):
        """Process a stream of documents.

@@ -240,6 +242,7 @@ cdef class Parser(TrainablePipe):
                error_handler(self.name, self, batch_in_order, e)


+    @trainable_pipe_nvtx_range
    def predict(self, docs):
        if isinstance(docs, Doc):
            docs = [docs]
@@ -256,6 +259,7 @@ cdef class Parser(TrainablePipe):
                beam_density=self.cfg["beam_density"]
            )

+    @trainable_pipe_nvtx_range
    def greedy_parse(self, docs, drop=0.):
        cdef vector[StateC*] states
        cdef StateClass state
@@ -280,6 +284,7 @@ cdef class Parser(TrainablePipe):
        del model
        return batch

+    @trainable_pipe_nvtx_range
    def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
        cdef Beam beam
        cdef Doc doc
@@ -321,6 +326,7 @@ cdef class Parser(TrainablePipe):
            unfinished.clear()
        free_activations(&activations)

+    @trainable_pipe_nvtx_range
    def set_annotations(self, docs, states_or_beams):
        cdef StateClass state
        cdef Beam beam
@@ -331,6 +337,7 @@ cdef class Parser(TrainablePipe):
            for hook in self.postprocesses:
                hook(doc)

+    @trainable_pipe_nvtx_range
    def transition_states(self, states, float[:, ::1] scores):
        cdef StateClass state
        cdef float* c_scores = &scores[0, 0]
@@ -360,6 +367,7 @@ cdef class Parser(TrainablePipe):
                action.do(states[i], action.label)
        free(is_valid)

+    @trainable_pipe_nvtx_range
    def update(self, examples, *, drop=0., sgd=None, losses=None):
        cdef StateClass state
        if losses is None:
@@ -432,6 +440,7 @@ cdef class Parser(TrainablePipe):
        del model
        return losses

+    @trainable_pipe_nvtx_range
    def rehearse(self, examples, sgd=None, losses=None, **cfg):
        """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
        if losses is None:
@@ -481,6 +490,7 @@ cdef class Parser(TrainablePipe):
        del tutor
        return losses

+    @trainable_pipe_nvtx_range
    def update_beam(self, examples, *, beam_width,
            drop=0., sgd=None, losses=None, beam_density=0.0):
        states, golds, _ = self.moves.init_gold_batch(examples)
@@ -502,6 +512,7 @@ cdef class Parser(TrainablePipe):
        if sgd is not None:
            self.finish_update(sgd)

+    @trainable_pipe_nvtx_range
    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
        cdef StateClass state
        cdef Pool mem = Pool()
@@ -535,6 +546,7 @@ cdef class Parser(TrainablePipe):
    def set_output(self, nO):
        self.model.attrs["resize_output"](self.model, nO)

+    @trainable_pipe_nvtx_range
    def initialize(self, get_examples, nlp=None, labels=None):
        validate_get_examples(get_examples, "Parser.initialize")
        util.check_lexeme_norms(self.vocab, "parser or NER")