Use chars loss in ClozeMultitask

Matthew Honnibal 2019-10-20 17:47:15 +02:00
parent 77af446d04
commit eba89f08bd


@@ -30,6 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
@@ -837,11 +838,15 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
     @classmethod
     def Model(cls, vocab, tok2vec, **cfg):
-        output_size = vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
-            zero_init(Affine(output_size, output_size, drop_factor=0.0))
-        )
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 10)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
+            output_size = vocab.vectors.data.shape[1]
+            output_layer = chain(
+                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
+                zero_init(Affine(output_size, output_size, drop_factor=0.0))
+            )
         model = chain(tok2vec, output_layer)
         model = masked_language_model(vocab, model)
         model.tok2vec = tok2vec
@@ -852,6 +857,8 @@ class ClozeMultitask(Pipe):
         self.vocab = vocab
         self.model = model
         self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 10)

     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
@@ -860,7 +867,8 @@ class ClozeMultitask(Pipe):
                        tok2vec=None, sgd=None, **kwargs):
         link_vectors_to_models(self.vocab)
         if self.model is True:
-            self.model = self.Model(self.vocab, tok2vec)
+            kwargs.update(self.cfg)
+            self.model = self.Model(self.vocab, tok2vec, **kwargs)
         X = self.model.ops.allocate((5, self.model.tok2vec.nO))
         self.model.output_layer.begin_training(X)
         if sgd is None:
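
For reference, the cfg-to-Model flow added above, shown with stand-in objects rather than the real spaCy classes (sketch only):

    # Stand-in for ClozeMultitask.Model: picks the head from the merged config.
    def Model(vocab, tok2vec, **cfg):
        if cfg["objective"] == "characters":
            return ("characters-head", cfg.get("nr_char", 10))
        return ("vectors-head", None)

    cfg = {}
    cfg.setdefault("objective", "characters")   # defaults set in __init__
    cfg.setdefault("nr_char", 10)

    kwargs = {"drop": 0.2}        # whatever begin_training was called with
    kwargs.update(cfg)            # cfg entries override/extend the call kwargs
    assert Model(None, None, **kwargs) == ("characters-head", 10)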
@@ -874,13 +882,16 @@ class ClozeMultitask(Pipe):
         return tokvecs, vectors

     def get_loss(self, docs, vectors, prediction):
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-        target = vectors[ids]
-        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
+        if self.cfg["objective"] == "characters":
+            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
+        else:
+            # The simplest way to implement this would be to vstack the
+            # token.vector values, but that's a bit inefficient, especially on GPU.
+            # Instead we fetch the index into the vectors table for each of our tokens,
+            # and look them up all at once. This prevents data copying.
+            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+            target = vectors[ids]
+            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
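
Rough sketch of what a characters loss computes, as a stand-in for `get_characters_loss` (the real helper lives in spaCy's `_ml` module; the names and the exact loss form below are illustrative): an error term between the predicted per-slot distributions and a one-hot encoding of each token's first `nr_char` characters.

    import numpy

    def characters_loss_sketch(words, prediction, nr_char=10):
        # prediction: (n_tokens, nr_char * 256) per-slot character distributions.
        n_tokens = len(words)
        probs = prediction.reshape((n_tokens, nr_char, 256))
        target = numpy.zeros((n_tokens, nr_char, 256), dtype="f")
        for i, word in enumerate(words):
            chars = word[:nr_char].ljust(nr_char, " ")    # pad/truncate to nr_char
            for j, ch in enumerate(chars):
                target[i, j, min(ord(ch), 255)] = 1.0     # clamp non-byte codepoints
        diff = probs - target
        loss = (diff ** 2).sum()                          # squared-error proxy
        d_scores = diff / float(n_tokens)
        return float(loss), d_scores.reshape((n_tokens, nr_char * 256))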