Use chars loss in ClozeMultitask

This commit is contained in:
Matthew Honnibal 2019-10-20 17:47:15 +02:00
parent 77af446d04
commit eba89f08bd


@@ -30,6 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
@@ -837,11 +838,15 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
     @classmethod
     def Model(cls, vocab, tok2vec, **cfg):
-        output_size = vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
-            zero_init(Affine(output_size, output_size, drop_factor=0.0))
-        )
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 10)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
+            output_size = vocab.vectors.data.shape[1]
+            output_layer = chain(
+                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
+                zero_init(Affine(output_size, output_size, drop_factor=0.0))
+            )
         model = chain(tok2vec, output_layer)
         model = masked_language_model(vocab, model)
         model.tok2vec = tok2vec
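
For illustration, here is a minimal numpy sketch of what a multi-softmax output layer computes: one independent 256-way softmax (one class per byte value) for each of nr_char character positions, all read off a single output vector. The function name multi_softmax and the shapes are hypothetical stand-ins, not spaCy's MultiSoftmax implementation.

import numpy as np

def multi_softmax(scores, out_sizes):
    # scores: (n_tokens, sum(out_sizes)); each block of columns is one
    # independent classifier, here 256 classes (byte values) per position.
    outputs = []
    start = 0
    for size in out_sizes:
        block = scores[:, start : start + size]
        block = block - block.max(axis=1, keepdims=True)  # numerical stability
        exp = np.exp(block)
        outputs.append(exp / exp.sum(axis=1, keepdims=True))
        start += size
    return np.concatenate(outputs, axis=1)

probs = multi_softmax(np.random.randn(4, 256 * 10), [256] * 10)

With nr_char = 10, the layer therefore makes ten independent byte classifications per token instead of regressing onto a pretrained vector.
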
@@ -852,6 +857,8 @@ class ClozeMultitask(Pipe):
         self.vocab = vocab
         self.model = model
         self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 10)

     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
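
A small usage sketch of the dict.setdefault semantics relied on above: defaults only fill in missing keys, so a caller-supplied objective is preserved while "characters" becomes the default otherwise. The cfg values here are made up for illustration.

cfg = {"objective": "vectors"}
cfg.setdefault("objective", "characters")
cfg.setdefault("nr_char", 10)
assert cfg == {"objective": "vectors", "nr_char": 10}
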
@ -860,7 +867,8 @@ class ClozeMultitask(Pipe):
tok2vec=None, sgd=None, **kwargs):
link_vectors_to_models(self.vocab)
if self.model is True:
self.model = self.Model(self.vocab, tok2vec)
kwargs.update(self.cfg)
self.model = self.Model(self.vocab, tok2vec, **kwargs)
X = self.model.ops.allocate((5, self.model.tok2vec.nO))
self.model.output_layer.begin_training(X)
if sgd is None:
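
Note the merge order here: kwargs.update(self.cfg) overwrites the keyword arguments with the stored config, so self.cfg wins on any conflicting key before being passed to Model. A tiny sketch of that behaviour, with made-up keys:

kwargs = {"objective": "vectors", "extra": True}
cfg = {"objective": "characters", "nr_char": 10}
kwargs.update(cfg)
assert kwargs == {"objective": "characters", "extra": True, "nr_char": 10}
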
@@ -874,13 +882,16 @@ class ClozeMultitask(Pipe):
         return tokvecs, vectors

     def get_loss(self, docs, vectors, prediction):
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-        target = vectors[ids]
-        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
+        if self.cfg["objective"] == "characters":
+            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
+        else:
+            # The simplest way to implement this would be to vstack the
+            # token.vector values, but that's a bit inefficient, especially on GPU.
+            # Instead we fetch the index into the vectors table for each of our tokens,
+            # and look them up all at once. This prevents data copying.
+            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+            target = vectors[ids]
+            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
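
For reference, a hedged numpy sketch of what a character-level loss could look like. The name characters_loss, its shapes, and the squared-error formulation are assumptions for illustration, not necessarily what spaCy's get_characters_loss does: it assumes one-hot byte targets per character position, compared against the multi-softmax prediction.

import numpy as np

def characters_loss(prediction, char_ids, nr_char=10):
    # prediction: (n_tokens, 256 * nr_char), the multi-softmax output.
    # char_ids: (n_tokens, nr_char) integer byte values per token.
    n_tokens = prediction.shape[0]
    target = np.zeros((n_tokens, nr_char, 256), dtype="f")
    rows = np.arange(n_tokens)[:, None]
    cols = np.arange(nr_char)[None, :]
    target[rows, cols, char_ids] = 1.0            # one-hot per position
    target = target.reshape((n_tokens, nr_char * 256))
    diff = prediction - target
    loss = (diff ** 2).sum()                      # summed squared error
    return float(loss), diff / n_tokens           # scaled difference as gradient

pred = np.random.rand(4, 256 * 10)
ids = np.random.randint(0, 256, size=(4, 10))
loss, grad = characters_loss(pred, ids)

Either branch returns the same (float(loss), gradient) pair, so update can backpropagate through the masked language model without caring which objective is active.
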