From eba89f08bd90a743631dbdfa9fef86d22cf2cbcb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 20 Oct 2019 17:47:15 +0200
Subject: [PATCH] Use chars loss in ClozeMultitask

---
 spacy/pipeline/pipes.pyx | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 97c3a0cdf..7aca98010 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -30,6 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
 
@@ -837,11 +838,15 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
     @classmethod
     def Model(cls, vocab, tok2vec, **cfg):
-        output_size = vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
-            zero_init(Affine(output_size, output_size, drop_factor=0.0))
-        )
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 10)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
+            output_size = vocab.vectors.data.shape[1]
+            output_layer = chain(
+                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
+                zero_init(Affine(output_size, output_size, drop_factor=0.0))
+            )
         model = chain(tok2vec, output_layer)
         model = masked_language_model(vocab, model)
         model.tok2vec = tok2vec
@@ -852,6 +857,8 @@ class ClozeMultitask(Pipe):
         self.vocab = vocab
         self.model = model
         self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 10)
 
     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
@@ -860,7 +867,8 @@ class ClozeMultitask(Pipe):
                        tok2vec=None, sgd=None, **kwargs):
         link_vectors_to_models(self.vocab)
         if self.model is True:
-            self.model = self.Model(self.vocab, tok2vec)
+            kwargs.update(self.cfg)
+            self.model = self.Model(self.vocab, tok2vec, **kwargs)
         X = self.model.ops.allocate((5, self.model.tok2vec.nO))
         self.model.output_layer.begin_training(X)
         if sgd is None:
@@ -874,13 +882,16 @@ class ClozeMultitask(Pipe):
         return tokvecs, vectors
 
     def get_loss(self, docs, vectors, prediction):
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-        target = vectors[ids]
-        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
+        if self.cfg["objective"] == "characters":
+            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
+        else:
+            # The simplest way to implement this would be to vstack the
+            # token.vector values, but that's a bit inefficient, especially on GPU.
+            # Instead we fetch the index into the vectors table for each of our tokens,
+            # and look them up all at once. This prevents data copying.
+            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+            target = vectors[ids]
+            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient
 
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
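
Note (not part of the patch): get_characters_loss is imported from spacy/_ml.py but not defined in this diff. The Model changes show the idea: instead of reconstructing a masked token's pretrained vector, the model predicts the token's first nr_char characters, one 256-way softmax per character position, which is why the output layer is MultiSoftmax([256] * nr_char). Below is a minimal sketch of what such a loss could look like. It assumes a Doc.to_utf8_array(nr_char=...) helper returning each token's first nr_char UTF-8 bytes and a thinc-style ops object with asarray; the actual definition in _ml.py may differ.

    import numpy

    def get_characters_loss(ops, docs, prediction, nr_char=10):
        # Target byte ids: one row of nr_char values in [0, 256) per token.
        # Doc.to_utf8_array is an assumption here, not confirmed by the diff.
        target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
        target_ids = target_ids.reshape((-1,))
        # One-hot encode each byte, then flatten to match MultiSoftmax's
        # (n_tokens, 256 * nr_char) output layout.
        target = numpy.zeros((target_ids.size, 256), dtype="f")
        target[numpy.arange(target_ids.size), target_ids] = 1.0
        target = ops.asarray(target.reshape((-1, 256 * nr_char)))
        # Squared-error loss and its gradient w.r.t. the prediction.
        diff = prediction - target
        loss = (diff ** 2).sum()
        d_prediction = diff / float(prediction.shape[0])
        return float(loss), d_prediction

One practical upside of making "characters" the default objective: it is defined even when vocab.vectors is empty, whereas the cosine-loss path needs pretrained vectors to supply targets. Any other value of cfg["objective"] (the diff doesn't name one) falls back to that older vector-reconstruction path.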