Implement character-based pretraining objective

Matthew Honnibal 2019-10-19 11:42:38 +02:00
parent 36de9bf72a
commit ee56c6a4e1


@@ -13,12 +13,14 @@ from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly
from thinc.neural.util import to_categorical
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model, get_cossim_loss
from .._ml import MultiSoftmax
from .. import util
from .train import _load_pretrained_tok2vec
@@ -121,11 +123,7 @@ def pretrain(
msg = Printer()
util.fix_random_seed(seed)
has_gpu = prefer_gpu()
if has_gpu:
import torch
torch.set_default_tensor_type("torch.cuda.FloatTensor")
has_gpu = prefer_gpu(gpu_id=1)
msg.info("Using GPU" if has_gpu else "Not using GPU")
output_dir = Path(output_dir)
@@ -167,6 +165,7 @@ def pretrain(
cnn_maxout_pieces=3, # You can try setting this higher
subword_features=not use_chars, # Set to False for Chinese etc
),
objective=loss_func
)
# Load in pretrained weights
if init_tok2vec is not None:
@@ -257,7 +256,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs, drop=drop)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
if objective == "characters":
loss, gradients = get_characters_loss(model.ops, docs, predictions)
else:
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
backprop(gradients, sgd=optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
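In practice the new objective is selected simply by passing objective="characters" through to make_update. A minimal usage sketch, assuming a surrounding pretraining loop (the `texts` variable, batch size, and length limits here are illustrative, not taken from this diff):

# Illustrative loop; `texts` and the sizes below are assumptions.
for batch in util.minibatch(texts, size=64):
    docs, skip_count = make_docs(nlp, batch, min_length=5, max_length=500)
    loss = make_update(model, docs, optimizer, objective="characters")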
@@ -295,6 +297,17 @@ def make_docs(nlp, batch, min_length, max_length):
return docs, skip_count
def get_characters_loss(ops, docs, prediction, nr_char=10):
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f")
target = target.reshape((-1, 256*nr_char))
diff = prediction - target
loss = (diff**2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
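To make the shapes concrete: with N tokens in the batch and nr_char=10, doc.to_utf8_array() yields an (N, 10) array of UTF-8 byte ids, which is one-hot encoded over 256 byte values and flattened to an (N, 2560) target that must match the model's output width. A self-contained numpy sketch of the same computation, with random arrays standing in for the real targets and predictions:

import numpy

N, nr_char = 32, 10
target_ids = numpy.random.randint(0, 256, size=(N, nr_char))  # stand-in for doc.to_utf8_array(nr_char=10)
one_hot = numpy.eye(256, dtype="f")[target_ids.reshape(-1)]   # (N * nr_char, 256), like to_categorical
target = one_hot.reshape((-1, 256 * nr_char))                 # (N, 2560): one 256-wide block per character
prediction = numpy.random.rand(N, 256 * nr_char).astype("f")  # shape the output layer must produce
diff = prediction - target
loss = (diff ** 2).sum()                                      # squared error over all character slots
d_target = diff / float(prediction.shape[0])                  # gradient, scaled by the number of tokens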
def get_vectors_loss(ops, docs, prediction, objective="L2"):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@@ -319,16 +332,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
return loss, d_target
def create_pretraining_model(nlp, tok2vec):
def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
if objective == "characters":
out_sizes = [256] * nr_char
output_layer = chain(
LN(Maxout(300, pieces=3)),
MultiSoftmax(out_sizes, 300)
)
else:
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we cann
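For the character objective, the output layer's width is 256 * nr_char: one independent 256-way distribution per character position. A rough sketch of what a MultiSoftmax-style normalisation over those slices looks like, assuming it softmaxes each field separately; the helper below is illustrative, not spaCy's implementation:

import numpy

def multi_softmax_like(scores, out_sizes):
    # Softmax each field of the output independently, mimicking the assumed
    # behaviour of MultiSoftmax([256] * nr_char, 300) as used above.
    pieces, start = [], 0
    for size in out_sizes:
        block = scores[:, start:start + size]
        block = numpy.exp(block - block.max(axis=1, keepdims=True))
        pieces.append(block / block.sum(axis=1, keepdims=True))
        start += size
    return numpy.concatenate(pieces, axis=1)

nr_char = 10
hidden = numpy.random.rand(4, 256 * nr_char)          # e.g. scores for 4 tokens
probs = multi_softmax_like(hidden, [256] * nr_char)   # each 256-wide slice sums to 1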