From eba89f08bd90a743631dbdfa9fef86d22cf2cbcb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 20 Oct 2019 17:47:15 +0200
Subject: [PATCH] Use chars loss in ClozeMultitask

---
 spacy/pipeline/pipes.pyx | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 97c3a0cdf..7aca98010 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -30,6 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
 
@@ -837,11 +838,15 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
     @classmethod
     def Model(cls, vocab, tok2vec, **cfg):
-        output_size = vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
-            zero_init(Affine(output_size, output_size, drop_factor=0.0))
-        )
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 10)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
+            output_size = vocab.vectors.data.shape[1]
+            output_layer = chain(
+                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
+                zero_init(Affine(output_size, output_size, drop_factor=0.0))
+            )
         model = chain(tok2vec, output_layer)
         model = masked_language_model(vocab, model)
         model.tok2vec = tok2vec
@@ -852,6 +857,8 @@ class ClozeMultitask(Pipe):
         self.vocab = vocab
         self.model = model
         self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 10)
 
     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
@@ -860,7 +867,8 @@ class ClozeMultitask(Pipe):
                        tok2vec=None, sgd=None, **kwargs):
         link_vectors_to_models(self.vocab)
         if self.model is True:
-            self.model = self.Model(self.vocab, tok2vec)
+            kwargs.update(self.cfg)
+            self.model = self.Model(self.vocab, tok2vec, **kwargs)
         X = self.model.ops.allocate((5, self.model.tok2vec.nO))
         self.model.output_layer.begin_training(X)
         if sgd is None:
@@ -874,13 +882,16 @@ class ClozeMultitask(Pipe):
         return tokvecs, vectors
 
     def get_loss(self, docs, vectors, prediction):
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-        target = vectors[ids]
-        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
+        if self.cfg["objective"] == "characters":
+            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
+        else:
+            # The simplest way to implement this would be to vstack the
+            # token.vector values, but that's a bit inefficient, especially on GPU.
+            # Instead we fetch the index into the vectors table for each of our tokens,
+            # and look them up all at once. This prevents data copying.
+            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+            target = vectors[ids]
+            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient
 
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
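
Note (not part of the patch): get_characters_loss is imported from spacy/_ml.py but not defined in this diff. The Model changes show the idea: instead of reconstructing a masked token's pretrained vector, the model predicts the token's first nr_char characters, one 256-way softmax per character position, which is why the output layer is MultiSoftmax([256] * nr_char). Below is a minimal sketch of what such a loss could look like. It assumes a Doc.to_utf8_array(nr_char=...) helper returning each token's first nr_char UTF-8 bytes and a thinc-style ops object with asarray; the actual definition in _ml.py may differ.

    import numpy

    def get_characters_loss(ops, docs, prediction, nr_char=10):
        # Target byte ids: one row of nr_char values in [0, 256) per token.
        # Doc.to_utf8_array is an assumption here, not confirmed by the diff.
        target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
        target_ids = target_ids.reshape((-1,))
        # One-hot encode each byte, then flatten to match MultiSoftmax's
        # (n_tokens, 256 * nr_char) output layout.
        target = numpy.zeros((target_ids.size, 256), dtype="f")
        target[numpy.arange(target_ids.size), target_ids] = 1.0
        target = ops.asarray(target.reshape((-1, 256 * nr_char)))
        # Squared-error loss and its gradient w.r.t. the prediction.
        diff = prediction - target
        loss = (diff ** 2).sum()
        d_prediction = diff / float(prediction.shape[0])
        return float(loss), d_prediction

One practical upside of making "characters" the default objective: it is defined even when vocab.vectors is empty, whereas the cosine-loss path needs pretrained vectors to supply targets. Any other value of cfg["objective"] (the diff doesn't name one) falls back to that older vector-reconstruction path.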