Remove unused code from spacy pretrain

Matthew Honnibal 2018-12-18 19:19:26 +01:00
parent 5f0c5fbfa4
commit 0f83b98afa


@@ -100,11 +100,11 @@ def pretrain(
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=0,  # Requires PyTorch. Experimental.
             cnn_maxout_pieces=3,  # You can try setting this higher
-            subword_features=True,
+            subword_features=True,  # Set to False for Chinese etc
         ),
-    )  # Set to False for character models, e.g. Chinese
+    )
     optimizer = create_default_optimizer(model.ops)
-    tracker = ProgressTracker()
+    tracker = ProgressTracker(frequency=10000)
     msg.divider("Pre-training tok2vec layer")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
@@ -136,7 +136,7 @@ def pretrain(
         random.shuffle(texts)


-def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
+def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
     """Perform an update over a single batch of documents.

     docs (iterable): A batch of `Doc` objects.
@@ -145,13 +145,12 @@ def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    gradients = get_vectors_loss(model.ops, docs, predictions, objective)
+    loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
     backprop(gradients, sgd=optimizer)
     # Don't want to return a cupy object here
     # The gradients are modified in-place by the BERT MLM,
     # so we get an accurate loss
-    loss = float((gradients ** 2).sum())
-    return loss
+    return float(loss)


 def make_docs(nlp, batch, min_length=1, max_length=500):
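Note on this hunk: make_update now returns the loss value that get_vectors_loss computed before backprop runs, instead of recomputing float((gradients ** 2).sum()) afterwards. As the in-diff comment says, the gradients are modified in place (by the BERT MLM code), so a post-hoc sum of squares would no longer reflect the true loss. A minimal standalone sketch of why the ordering matters; toy_backprop is a hypothetical stand-in for the model's backprop callback, not spaCy's:

import numpy

def toy_backprop(d_scores, sgd=None):
    # hypothetical stand-in: modifies the gradient buffer in place,
    # the way the real backprop pass is described as doing
    d_scores *= 0.0

prediction = numpy.asarray([[1.0, 2.0], [0.0, 1.0]])
target = numpy.asarray([[0.5, 2.0], [1.0, 1.0]])

d_scores = prediction - target
loss = (d_scores ** 2).sum()        # measured before backprop: 0.25 + 1.0 = 1.25
toy_backprop(d_scores)
stale_loss = (d_scores ** 2).sum()  # measured after the in-place update: 0.0
print(float(loss), float(stale_loss))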
@@ -172,7 +171,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
     return docs


-def get_vectors_loss(ops, docs, prediction, objective):
+def get_vectors_loss(ops, docs, prediction, objective='L2'):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.

@@ -188,80 +187,23 @@ def get_vectors_loss(ops, docs, prediction, objective):
     target = docs[0].vocab.vectors.data[ids]
     if objective == 'L2':
         d_scores = prediction - target
-    elif objective == 'nllvmf':
-        d_scores = get_nllvmf_loss(prediction, target)
+        loss = (d_scores**2).sum()
     else:
-        d_scores = get_cossim_loss(prediction, target)
-    return d_scores
+        raise NotImplementedError(objective)
+    return loss, d_scores


-def get_cossim_loss(yh, y):
-    # Add a small constant to avoid 0 vectors
-    yh = yh + 1e-8
-    y = y + 1e-8
-    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
-    xp = get_array_module(yh)
-    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
-    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
-    mul_norms = norm_yh * norm_y
-    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
-    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2))
-    return d_yh
-
-
-def get_nllvmf_loss(Yh, Y):
-    """Compute the gradient of the negative log likelihood von Mises-Fisher loss,
-    from Kumar and Tsetskov.
-
-    Yh: Predicted vectors.
-    Y: True vectors
-    Returns dYh: Gradient of loss with respect to prediction.
-    """
-    # Warning: Probably wrong? Also needs normalization
-    xp = get_array_module(Yh)
-    assert not xp.isnan(Yh).any()
-    assert not xp.isnan(Y).any()
-    return _backprop_bessel(Yh) * Y
-
-
-def _backprop_bessel(k, approximate=True):
-    if approximate:
-        return -_ratio(k.shape[1]/2, k)
-    from scipy.special import ive
-    xp = get_array_module(k)
-    if not isinstance(k, numpy.ndarray):
-        k = k.get()
-    k = numpy.asarray(k, dtype='float64')
-    assert not numpy.isnan(k).any()
-    m = k.shape[1]
-    numerator = ive(m/2, k)
-    assert not numpy.isnan(numerator).any()
-    denom = ive(m/2-1, k)
-    assert not numpy.isnan(denom).any()
-    x = -(numerator / (denom+1e-8))
-    assert not numpy.isnan(x).any()
-    return xp.array(x, dtype='f')
-
-
-def _ratio(v, z):
-    return z/(v-1+numpy.sqrt((v+1)**2 + z**2, dtype='f'))
-
-
-def create_pretraining_model(nlp, tok2vec, normalized=False):
+def create_pretraining_model(nlp, tok2vec):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
     """
-    if normalized:
-        normalize_vectors(nlp.vocab.vectors.data)
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = chain(
         LN(Maxout(300, pieces=3)),
         Affine(output_size, drop_factor=0.0),
     )
-    if normalized:
-        output_layer = chain(output_layer, normalize)
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
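Note on this hunk: after the change the only supported objective is 'L2'. The gradient with respect to the prediction is simply prediction - target, and the reported loss is the summed squared error over all tokens in the batch. A self-contained sketch of that computation, with random arrays standing in for the model output and the vocab's vector rows (an illustration of the formula, not the spaCy function itself):

import numpy

def l2_vectors_loss(prediction, target):
    # mirrors the 'L2' branch: the gradient is the raw difference,
    # the loss is the summed squared error
    d_scores = prediction - target
    loss = (d_scores ** 2).sum()
    return float(loss), d_scores

rng = numpy.random.RandomState(0)
prediction = rng.normal(size=(8, 300)).astype("f")  # one row per token
target = rng.normal(size=(8, 300)).astype("f")      # e.g. pretrained vector rows
loss, d_scores = l2_vectors_loss(prediction, target)
print(loss, d_scores.shape)  # scalar loss and an (8, 300) gradient array

Strictly, the derivative of the summed squared error is 2 * (prediction - target); dropping the constant factor, as the code does, only rescales the effective learning rate.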
@@ -275,28 +217,6 @@ def create_pretraining_model(nlp, tok2vec, normalized=False):
     return model


-@layerize
-def normalize(X, drop=0.):
-    xp = get_array_module(X)
-    norms = xp.sqrt((X**2).sum(axis=1, keepdims=True)+1e-8)
-    Y = X / norms
-    def backprop_normalize(dY, sgd=None):
-        d_norms = 2 * norms
-        #dY = (dX * norms - X * d_norms) / norms**2
-        #dY * norms**2 = dX * norms - X * d_norms
-        #dY * norms**2 + X * d_norms = dX * norms
-        #(dY * norms**2 + X * d_norms) / norms = dX
-        dX = (dY * norms**2 + X * d_norms) / norms
-        return dX
-    return Y, backprop_normalize
-
-
-def normalize_vectors(vectors_data):
-    xp = get_array_module(vectors_data)
-    norms = xp.sqrt((vectors_data**2).sum(axis=1, keepdims=True)+1e-8)
-    vectors_data /= norms
-
-
 class ProgressTracker(object):
     def __init__(self, frequency=1000000):
         self.loss = 0.0
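One other behavioural change, from the first hunk: the tracker is now constructed as ProgressTracker(frequency=10000) instead of relying on the frequency=1000000 default, so status rows are emitted far more often during pre-training. Only the opening lines of __init__ are visible in this diff, so the following is a hypothetical illustration of a frequency-gated counter of that general shape, not spaCy's actual class:

class ToyProgressTracker(object):
    # hypothetical stand-in: accumulate the loss and report once
    # every `frequency` words seen
    def __init__(self, frequency=1000000):
        self.loss = 0.0
        self.words = 0
        self.frequency = frequency

    def update(self, loss, n_words):
        self.loss += loss
        self.words += n_words
        if self.words >= self.frequency:
            status = (self.words, self.loss)
            self.words = 0
            return status
        return None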