Annotate TrainablePipe subclasses with NVTX ranges

This commit is contained in:
shademe 2022-06-14 15:29:40 +02:00
parent 26536eb6b8
commit d8684f7372
9 changed files with 76 additions and 9 deletions

View File

@ -12,7 +12,7 @@ from thinc.types import Floats2d, Ints1d, Ints2d
from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ..errors import Errors
from ..language import Language
from ..tokens import Doc
@ -117,6 +117,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
@trainable_pipe_nvtx_range
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
@ -144,6 +145,7 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores
@trainable_pipe_nvtx_range
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
@ -186,6 +188,7 @@ class EditTreeLemmatizer(TrainablePipe):
return guesses
@trainable_pipe_nvtx_range
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
for i, doc in enumerate(docs):
doc_tree_ids = batch_tree_ids[i]
@ -224,6 +227,7 @@ class EditTreeLemmatizer(TrainablePipe):
trees.append(tree)
return dict(trees=trees, labels=tuple(self.cfg["labels"]))
@trainable_pipe_nvtx_range
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],

View File

@ -12,7 +12,7 @@ from ..ml import empty_kb
from ..tokens import Doc, Span
from .pipe import deserialize_config
from .legacy.entity_linker import EntityLinker_v1
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ..language import Language
from ..vocab import Vocab
from ..training import Example, validate_examples, validate_get_examples
@ -208,6 +208,7 @@ class EntityLinker(TrainablePipe):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
@trainable_pipe_nvtx_range
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
@ -278,6 +279,7 @@ class EntityLinker(TrainablePipe):
return False
@trainable_pipe_nvtx_range
def update(
self,
examples: Iterable[Example],
@ -338,6 +340,7 @@ class EntityLinker(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker.get_loss")
entity_encodings = []
@ -377,6 +380,7 @@ class EntityLinker(TrainablePipe):
loss = loss / len(entity_encodings)
return float(loss), out
@trainable_pipe_nvtx_range
def predict(self, docs: Iterable[Doc]) -> List[str]:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
@ -466,6 +470,7 @@ class EntityLinker(TrainablePipe):
raise RuntimeError(err)
return final_kb_ids
@trainable_pipe_nvtx_range
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
"""Modify a batch of documents, using pre-computed scores.
@ -573,8 +578,10 @@ class EntityLinker(TrainablePipe):
util.from_disk(path, deserialize, exclude)
return self
@trainable_pipe_nvtx_range
def rehearse(self, examples, *, sgd=None, losses=None, **config):
    """Rehearsal entry point; not implemented for this component.

    All arguments are accepted and ignored.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
@trainable_pipe_nvtx_range
def add_label(self, label):
    """Label registration entry point; not implemented for this component.

    The `label` argument is accepted and ignored.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

View File

@ -15,7 +15,7 @@ from ...kb import KnowledgeBase, Candidate
from ...ml import empty_kb
from ...tokens import Doc, Span
from ..pipe import deserialize_config
from ..trainable_pipe import TrainablePipe
from ..trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ...language import Language
from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples
@ -103,6 +103,7 @@ class EntityLinker_v1(TrainablePipe):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
@trainable_pipe_nvtx_range
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
@ -138,6 +139,7 @@ class EntityLinker_v1(TrainablePipe):
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
)
@trainable_pipe_nvtx_range
def update(
self,
examples: Iterable[Example],
@ -203,6 +205,7 @@ class EntityLinker_v1(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker_v1.get_loss")
entity_encodings = []
@ -224,6 +227,7 @@ class EntityLinker_v1(TrainablePipe):
loss = loss / len(entity_encodings)
return float(loss), gradients
@trainable_pipe_nvtx_range
def predict(self, docs: Iterable[Doc]) -> List[str]:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
@ -312,6 +316,7 @@ class EntityLinker_v1(TrainablePipe):
raise RuntimeError(err)
return final_kb_ids
@trainable_pipe_nvtx_range
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
"""Modify a batch of documents, using pre-computed scores.
@ -419,8 +424,10 @@ class EntityLinker_v1(TrainablePipe):
util.from_disk(path, deserialize, exclude)
return self
@trainable_pipe_nvtx_range
def rehearse(self, examples, *, sgd=None, losses=None, **config):
    """Rehearsal entry point; not implemented for this component.

    All arguments are accepted and ignored.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError
@trainable_pipe_nvtx_range
def add_label(self, label):
    """Label registration entry point; not implemented for this component.

    The `label` argument is accepted and ignored.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

View File

@ -6,7 +6,7 @@ from thinc.api import set_dropout_rate
from ..tokens.doc cimport Doc
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from .tagger import Tagger
from ..training import validate_examples
from ..language import Language
@ -78,9 +78,11 @@ class MultitaskObjective(Tagger):
def labels(self, value):
self.cfg["labels"] = value
@trainable_pipe_nvtx_range
def set_annotations(self, docs, dep_ids):
    """Deliberate no-op: nothing is written to `docs`.

    Both arguments are accepted and ignored; the method returns None.
    """
@trainable_pipe_nvtx_range
def initialize(self, get_examples, nlp=None, labels=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
@ -95,11 +97,13 @@ class MultitaskObjective(Tagger):
self.labels[label] = len(self.labels)
self.model.initialize() # TODO: fix initialization by defining X and Y
@trainable_pipe_nvtx_range
def predict(self, docs):
    """Run the model's "tok2vec" and "softmax" reference layers over `docs`.

    Returns a `(tokvecs, scores)` tuple: the output of the "tok2vec" layer
    called on `docs`, and the output of the "softmax" layer called on that
    result.
    """
    token_vectors = self.model.get_ref("tok2vec")(docs)
    # Look up the classifier only after the encoder has run, matching the
    # order in which the two layers are resolved and applied.
    classify = self.model.get_ref("softmax")
    return token_vectors, classify(token_vectors)
@trainable_pipe_nvtx_range
def get_loss(self, examples, scores):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
@ -174,19 +178,23 @@ class ClozeMultitask(TrainablePipe):
self.cfg = cfg
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
@trainable_pipe_nvtx_range
def set_annotations(self, docs, dep_ids):
    """Deliberate no-op: nothing is written to `docs`.

    Both arguments are accepted and ignored; the method returns None.
    """
@trainable_pipe_nvtx_range
def initialize(self, get_examples, nlp=None):
    """Initialize the model and then its "output_layer" reference layer.

    `get_examples` and `nlp` are accepted but not used here.

    The output layer is initialized with a freshly allocated array of shape
    `(5, nO)`, where `nO` is the output width reported by the "tok2vec"
    reference layer. The row count of 5 is hard-coded.
    """
    self.model.initialize()  # TODO: fix initialization by defining X and Y
    width = self.model.get_ref("tok2vec").get_dim("nO")
    sample = self.model.ops.alloc((5, width))
    # Resolve the output layer through get_ref, the same way predict() does;
    # the sub-layer is registered as a named reference, not as an attribute
    # of the model object.
    self.model.get_ref("output_layer").initialize(sample)
@trainable_pipe_nvtx_range
def predict(self, docs):
    """Run the model's "tok2vec" and "output_layer" reference layers.

    Returns a `(tokvecs, vectors)` tuple: the output of the "tok2vec" layer
    called on `docs`, and the output of the "output_layer" layer called on
    that result.
    """
    token_vectors = self.model.get_ref("tok2vec")(docs)
    # Resolve the output layer only after the encoder has run, keeping the
    # lookup/call order unchanged.
    project = self.model.get_ref("output_layer")
    return token_vectors, project(token_vectors)
@trainable_pipe_nvtx_range
def get_loss(self, examples, vectors, prediction):
validate_examples(examples, "ClozeMultitask.get_loss")
# The simplest way to implement this would be to vstack the
@ -199,9 +207,11 @@ class ClozeMultitask(TrainablePipe):
loss = self.distance.get_loss(prediction, target)
return float(loss), gradient
@trainable_pipe_nvtx_range
def update(self, examples, *, drop=0., sgd=None, losses=None):
    """Deliberate no-op: no training step is performed.

    All arguments are accepted and ignored; the method returns None
    (note that `losses` is not returned).
    """
@trainable_pipe_nvtx_range
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
@ -217,5 +227,6 @@ class ClozeMultitask(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def add_label(self, label):
    """Label registration entry point; not implemented for this component.

    The `label` argument is accepted and ignored.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

View File

@ -8,7 +8,7 @@ import numpy
from ..compat import Protocol, runtime_checkable
from ..scorer import Scorer
from ..language import Language
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ..tokens import Doc, SpanGroup, Span
from ..vocab import Vocab
from ..training import Example, validate_examples
@ -227,6 +227,7 @@ class SpanCategorizer(TrainablePipe):
"""
return str(self.cfg["spans_key"])
@trainable_pipe_nvtx_range
def add_label(self, label: str) -> int:
"""Add a new label to the pipe.
@ -260,6 +261,7 @@ class SpanCategorizer(TrainablePipe):
"""
return list(self.labels)
@trainable_pipe_nvtx_range
def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them.
@ -272,6 +274,7 @@ class SpanCategorizer(TrainablePipe):
scores = self.model.predict((docs, indices)) # type: ignore
return indices, scores
@trainable_pipe_nvtx_range
def set_candidates(
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
) -> None:
@ -290,6 +293,7 @@ class SpanCategorizer(TrainablePipe):
for index in candidates.dataXd:
doc.spans[candidates_key].append(doc[index[0] : index[1]])
@trainable_pipe_nvtx_range
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
@ -308,6 +312,7 @@ class SpanCategorizer(TrainablePipe):
)
offset += indices.lengths[i]
@trainable_pipe_nvtx_range
def update(
self,
examples: Iterable[Example],
@ -349,6 +354,7 @@ class SpanCategorizer(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def get_loss(
self, examples: Iterable[Example], spans_scores: Tuple[Ragged, Floats2d]
) -> Tuple[float, float]:
@ -399,6 +405,7 @@ class SpanCategorizer(TrainablePipe):
loss = float((d_scores**2).sum())
return loss, d_scores
@trainable_pipe_nvtx_range
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],

View File

@ -11,7 +11,7 @@ from ..tokens.doc cimport Doc
from ..morphology cimport Morphology
from ..vocab cimport Vocab
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from .pipe import deserialize_config
from ..language import Language
from ..attrs import POS, ID
@ -126,6 +126,7 @@ class Tagger(TrainablePipe):
"""Data about the labels currently added to the component."""
return tuple(self.cfg["labels"])
@trainable_pipe_nvtx_range
def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without modifying them.
@ -155,6 +156,7 @@ class Tagger(TrainablePipe):
guesses.append(doc_guesses)
return guesses
@trainable_pipe_nvtx_range
def set_annotations(self, docs, batch_tag_ids):
"""Modify a batch of documents, using pre-computed scores.
@ -177,6 +179,7 @@ class Tagger(TrainablePipe):
if doc.c[j].tag == 0 or overwrite:
doc.c[j].tag = self.vocab.strings[labels[tag_id]]
@trainable_pipe_nvtx_range
def update(self, examples, *, drop=0., sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
@ -210,6 +213,7 @@ class Tagger(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model,
@ -245,6 +249,7 @@ class Tagger(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
@ -269,6 +274,7 @@ class Tagger(TrainablePipe):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
@trainable_pipe_nvtx_range
def initialize(self, get_examples, *, nlp=None, labels=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -307,6 +313,7 @@ class Tagger(TrainablePipe):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
@trainable_pipe_nvtx_range
def add_label(self, label):
"""Add a new label to the pipe.

View File

@ -4,7 +4,7 @@ from thinc.types import Floats2d
import numpy
from itertools import islice
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ..language import Language
from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors
@ -181,6 +181,7 @@ class TextCategorizer(TrainablePipe):
"""
return self.labels # type: ignore[return-value]
@trainable_pipe_nvtx_range
def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them.
@ -199,6 +200,7 @@ class TextCategorizer(TrainablePipe):
scores = self.model.ops.asarray(scores)
return scores
@trainable_pipe_nvtx_range
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
@ -211,6 +213,7 @@ class TextCategorizer(TrainablePipe):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
@trainable_pipe_nvtx_range
def update(
self,
examples: Iterable[Example],
@ -248,6 +251,7 @@ class TextCategorizer(TrainablePipe):
losses[self.name] += loss
return losses
@trainable_pipe_nvtx_range
def rehearse(
self,
examples: Iterable[Example],
@ -306,6 +310,7 @@ class TextCategorizer(TrainablePipe):
truths = self.model.ops.asarray(truths) # type: ignore
return truths, not_missing # type: ignore
@trainable_pipe_nvtx_range
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.
@ -325,6 +330,7 @@ class TextCategorizer(TrainablePipe):
mean_square_error = (d_scores**2).mean()
return float(mean_square_error), d_scores
@trainable_pipe_nvtx_range
def add_label(self, label: str) -> int:
"""Add a new label to the pipe.
@ -344,6 +350,7 @@ class TextCategorizer(TrainablePipe):
self.vocab.strings.add(label)
return 1
@trainable_pipe_nvtx_range
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],

View File

@ -2,7 +2,7 @@ from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any
from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ..training import Example, validate_examples, validate_get_examples
from ..tokens import Doc
from ..vocab import Vocab
@ -109,6 +109,7 @@ class Tok2Vec(TrainablePipe):
if isinstance(node, Tok2VecListener) and node.upstream_name in names:
self.add_listener(node, component.name)
@trainable_pipe_nvtx_range
def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns a single tensor for a batch of documents.
@ -128,6 +129,7 @@ class Tok2Vec(TrainablePipe):
listener.receive(batch_id, tokvecs, _empty_backprop)
return tokvecs
@trainable_pipe_nvtx_range
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
"""Modify a batch of documents, using pre-computed scores.
@ -140,6 +142,7 @@ class Tok2Vec(TrainablePipe):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
@trainable_pipe_nvtx_range
def update(
self,
examples: Iterable[Example],
@ -194,9 +197,11 @@ class Tok2Vec(TrainablePipe):
self.listeners[-1].receive(batch_id, tokvecs, backprop)
return losses
@trainable_pipe_nvtx_range
def get_loss(self, examples, scores) -> None:
    """Deliberate no-op: no loss is computed for this component.

    Both arguments are accepted and ignored; the method returns None.
    """
@trainable_pipe_nvtx_range
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],

View File

@ -21,7 +21,7 @@ from ..ml.parser_model cimport predict_states, arg_max_if_valid
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes
from ..tokens.doc cimport Doc
from .trainable_pipe import TrainablePipe
from .trainable_pipe import TrainablePipe, trainable_pipe_nvtx_range
from ._parser_internals cimport _beam_utils
from ._parser_internals import _beam_utils
@ -159,6 +159,7 @@ cdef class Parser(TrainablePipe):
def incorrect_spans_key(self):
return self.cfg["incorrect_spans_key"]
@trainable_pipe_nvtx_range
def add_label(self, label):
resized = False
for action in self.moves.action_types:
@ -214,6 +215,7 @@ cdef class Parser(TrainablePipe):
with self.model.use_params(params):
yield
@trainable_pipe_nvtx_range
def pipe(self, docs, *, int batch_size=256):
"""Process a stream of documents.
@ -240,6 +242,7 @@ cdef class Parser(TrainablePipe):
error_handler(self.name, self, batch_in_order, e)
@trainable_pipe_nvtx_range
def predict(self, docs):
if isinstance(docs, Doc):
docs = [docs]
@ -256,6 +259,7 @@ cdef class Parser(TrainablePipe):
beam_density=self.cfg["beam_density"]
)
@trainable_pipe_nvtx_range
def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states
cdef StateClass state
@ -280,6 +284,7 @@ cdef class Parser(TrainablePipe):
del model
return batch
@trainable_pipe_nvtx_range
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
cdef Beam beam
cdef Doc doc
@ -321,6 +326,7 @@ cdef class Parser(TrainablePipe):
unfinished.clear()
free_activations(&activations)
@trainable_pipe_nvtx_range
def set_annotations(self, docs, states_or_beams):
cdef StateClass state
cdef Beam beam
@ -331,6 +337,7 @@ cdef class Parser(TrainablePipe):
for hook in self.postprocesses:
hook(doc)
@trainable_pipe_nvtx_range
def transition_states(self, states, float[:, ::1] scores):
cdef StateClass state
cdef float* c_scores = &scores[0, 0]
@ -360,6 +367,7 @@ cdef class Parser(TrainablePipe):
action.do(states[i], action.label)
free(is_valid)
@trainable_pipe_nvtx_range
def update(self, examples, *, drop=0., sgd=None, losses=None):
cdef StateClass state
if losses is None:
@ -432,6 +440,7 @@ cdef class Parser(TrainablePipe):
del model
return losses
@trainable_pipe_nvtx_range
def rehearse(self, examples, sgd=None, losses=None, **cfg):
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
if losses is None:
@ -481,6 +490,7 @@ cdef class Parser(TrainablePipe):
del tutor
return losses
@trainable_pipe_nvtx_range
def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, beam_density=0.0):
states, golds, _ = self.moves.init_gold_batch(examples)
@ -502,6 +512,7 @@ cdef class Parser(TrainablePipe):
if sgd is not None:
self.finish_update(sgd)
@trainable_pipe_nvtx_range
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
cdef StateClass state
cdef Pool mem = Pool()
@ -535,6 +546,7 @@ cdef class Parser(TrainablePipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
@trainable_pipe_nvtx_range
def initialize(self, get_examples, nlp=None, labels=None):
validate_get_examples(get_examples, "Parser.initialize")
util.check_lexeme_norms(self.vocab, "parser or NER")