Use chars loss in ClozeMultitask

2025-07-16 03:02:41 +03:00 · 2019-10-20 17:47:15 +02:00 · 2019-10-20 17:47:15 +02:00 · eba89f08bd
commit eba89f08bd
parent 77af446d04
1 changed files with 24 additions and 13 deletions
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -30,6 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util

@@ -837,6 +838,10 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
    @classmethod
    def Model(cls, vocab, tok2vec, **cfg):
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 10)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
            output_size = vocab.vectors.data.shape[1]
            output_layer = chain(
                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
@@ -852,6 +857,8 @@ class ClozeMultitask(Pipe):
        self.vocab = vocab
        self.model = model
        self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 10)

    def set_annotations(self, docs, dep_ids, tensors=None):
        pass
@@ -860,7 +867,8 @@ class ClozeMultitask(Pipe):
                        tok2vec=None, sgd=None, **kwargs):
        link_vectors_to_models(self.vocab)
        if self.model is True:
-            self.model = self.Model(self.vocab, tok2vec)
+            kwargs.update(self.cfg)
+            self.model = self.Model(self.vocab, tok2vec, **kwargs)
        X = self.model.ops.allocate((5, self.model.tok2vec.nO))
        self.model.output_layer.begin_training(X)
        if sgd is None:
@@ -874,6 +882,9 @@ class ClozeMultitask(Pipe):
        return tokvecs, vectors

    def get_loss(self, docs, vectors, prediction):
+        if self.cfg["objective"] == "characters":
+            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
+        else:
            # The simplest way to implement this would be to vstack the
            # token.vector values, but that's a bit inefficient, especially on GPU.
            # Instead we fetch the index into the vectors table for each of our tokens,