Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-03 19:08:06 +03:00
Try to implement more losses for pretraining
* Try to implement cosine loss. This one seems to be correct? Still unsure, but it performs okay.
* Try to implement the von Mises-Fisher loss. This one's definitely not right yet.
parent ab9494b2a3
commit 7c504b6ddb
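The cosine objective introduced in this commit backprops the partial derivative of cosine similarity with respect to the prediction (see get_cossim_loss() in the diff below). A minimal sanity check of that formula, not part of the commit, using plain NumPy with illustrative names:

import numpy

def cosine(yh, y):
    return (yh * y).sum() / (numpy.linalg.norm(yh) * numpy.linalg.norm(y))

def cosine_grad(yh, y):
    # d cos(yh, y) / d yh = y / (|yh| |y|) - cos(yh, y) * yh / |yh|**2
    norm_yh = numpy.linalg.norm(yh)
    norm_y = numpy.linalg.norm(y)
    cos = (yh * y).sum() / (norm_yh * norm_y)
    return y / (norm_yh * norm_y) - cos * yh / norm_yh ** 2

rng = numpy.random.RandomState(0)
yh = rng.uniform(-1.0, 1.0, 5)
y = rng.uniform(-1.0, 1.0, 5)
eps = 1e-6
numeric = numpy.array([
    (cosine(yh + eps * numpy.eye(5)[i], y) - cosine(yh - eps * numpy.eye(5)[i], y)) / (2 * eps)
    for i in range(5)
])
# The analytic gradient matches a central finite-difference estimate.
assert numpy.allclose(cosine_grad(yh, y), numeric, atol=1e-5)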
@@ -8,9 +8,9 @@ import time
 from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
-from thinc.api import wrap
+from thinc.api import wrap, layerize
 from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu
+from thinc.neural.util import prefer_gpu, get_array_module
 from wasabi import Printer
 import srsly
 
@@ -99,7 +99,7 @@ def pretrain(
             conv_depth=depth,
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=0,  # Requires PyTorch. Experimental.
-            cnn_maxout_pieces=2,  # You can try setting this higher
+            cnn_maxout_pieces=3,  # You can try setting this higher
             subword_features=True,
         ),
     )  # Set to False for character models, e.g. Chinese
@@ -136,7 +136,7 @@ def pretrain(
         random.shuffle(texts)
 
 
-def make_update(model, docs, optimizer, drop=0.0):
+def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
     """Perform an update over a single batch of documents.
 
     docs (iterable): A batch of `Doc` objects.
@@ -145,12 +145,12 @@ def make_update(model, docs, optimizer, drop=0.0):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    gradients = get_vectors_loss(model.ops, docs, predictions)
+    gradients = get_vectors_loss(model.ops, docs, predictions, objective)
     backprop(gradients, sgd=optimizer)
     # Don't want to return a cupy object here
     # The gradients are modified in-place by the BERT MLM,
     # so we get an accurate loss
-    loss = float((gradients ** 2).mean())
+    loss = float((gradients ** 2).sum())
     return loss
 
 
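A side note on the change from .mean() to .sum() above: the scalar reported by make_update() is the summed squared gradient, so its magnitude now grows with batch size and vector width. A tiny stand-alone illustration, not from the diff, with stand-in shapes:

import numpy

gradients = numpy.random.RandomState(0).uniform(-0.01, 0.01, (128, 300))
print(float((gradients ** 2).mean()))  # previous behaviour: averaged over all elements
print(float((gradients ** 2).sum()))   # behaviour after this commit: summed over all elements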
@@ -172,7 +172,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
     return docs
 
 
-def get_vectors_loss(ops, docs, prediction):
+def get_vectors_loss(ops, docs, prediction, objective):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.
 
@@ -186,20 +186,82 @@ def get_vectors_loss(ops, docs, prediction):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
-    d_scores = prediction - target
+    if objective == 'L2':
+        d_scores = prediction - target
+    elif objective == 'nllvmf':
+        d_scores = get_nllvmf_loss(prediction, target)
+    else:
+        d_scores = get_cossim_loss(prediction, target)
     return d_scores
 
 
-def create_pretraining_model(nlp, tok2vec):
+def get_cossim_loss(yh, y):
+    # Add a small constant to avoid 0 vectors
+    yh = yh + 1e-8
+    y = y + 1e-8
+    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+    xp = get_array_module(yh)
+    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+    mul_norms = norm_yh * norm_y
+    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2))
+    return d_yh
+
+
+def get_nllvmf_loss(Yh, Y):
+    """Compute the gradient of the negative log likelihood von Mises-Fisher loss,
+    from Kumar and Tsetskov.
+
+    Yh: Predicted vectors.
+    Y: True vectors
+
+    Returns dYh: Gradient of loss with respect to prediction.
+    """
+    # Warning: Probably wrong? Also needs normalization
+    xp = get_array_module(Yh)
+    assert not xp.isnan(Yh).any()
+    assert not xp.isnan(Y).any()
+    return _backprop_bessel(Yh) * Y
+
+
+def _backprop_bessel(k, approximate=True):
+    if approximate:
+        return -_ratio(k.shape[1]/2, k)
+    from scipy.special import ive
+    xp = get_array_module(k)
+    if not isinstance(k, numpy.ndarray):
+        k = k.get()
+    k = numpy.asarray(k, dtype='float64')
+    assert not numpy.isnan(k).any()
+    m = k.shape[1]
+    numerator = ive(m/2, k)
+    assert not numpy.isnan(numerator).any()
+    denom = ive(m/2-1, k)
+    assert not numpy.isnan(denom).any()
+    x = -(numerator / (denom+1e-8))
+    assert not numpy.isnan(x).any()
+    return xp.array(x, dtype='f')
+
+
+def _ratio(v, z):
+    return z/(v-1+numpy.sqrt((v+1)**2 + z**2, dtype='f'))
+
+
+def create_pretraining_model(nlp, tok2vec, normalized=False):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
     """
+    if normalized:
+        normalize_vectors(nlp.vocab.vectors.data)
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = chain(
-        LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
+        LN(Maxout(300, pieces=3)),
+        Affine(output_size, drop_factor=0.0),
     )
+    if normalized:
+        output_layer = chain(output_layer, normalize)
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
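The approximate branch of _backprop_bessel() above uses a closed-form estimate of the modified Bessel function ratio I_v(z) / I_{v-1}(z) that appears in the von Mises-Fisher gradient of Kumar & Tsvetkov. A small sketch, not part of the commit, comparing that estimate against SciPy's exponentially scaled Bessel functions (the exp(-|z|) scaling cancels in the ratio); the values used here are illustrative:

import numpy
from scipy.special import ive

def ratio_approx(v, z):
    # Same closed form as _ratio() in the diff: approximates I_v(z) / I_{v-1}(z)
    return z / (v - 1 + numpy.sqrt((v + 1) ** 2 + z ** 2))

v = 150.0                                # e.g. m / 2 for 300-dimensional vectors
z = numpy.linspace(20.0, 200.0, 5)       # concentration values well away from underflow
exact = ive(v, z) / ive(v - 1, z)        # ive(v, z) = iv(v, z) * exp(-|z|), so the scaling cancels
approx = ratio_approx(v, z)
print(numpy.abs(exact - approx).max())   # small: the approximation tracks the exact ratio closely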
@@ -213,6 +275,28 @@ def create_pretraining_model(nlp, tok2vec):
     return model
 
 
+@layerize
+def normalize(X, drop=0.):
+    xp = get_array_module(X)
+    norms = xp.sqrt((X**2).sum(axis=1, keepdims=True)+1e-8)
+    Y = X / norms
+    def backprop_normalize(dY, sgd=None):
+        d_norms = 2 * norms
+        #dY = (dX * norms - X * d_norms) / norms**2
+        #dY * norms**2 = dX * norms - X * d_norms
+        #dY * norms**2 + X * d_norms = dX * norms
+        #(dY * norms**2 + X * d_norms) / norms = dX
+        dX = (dY * norms**2 + X * d_norms) / norms
+        return dX
+    return Y, backprop_normalize
+
+
+def normalize_vectors(vectors_data):
+    xp = get_array_module(vectors_data)
+    norms = xp.sqrt((vectors_data**2).sum(axis=1, keepdims=True)+1e-8)
+    vectors_data /= norms
+
+
 class ProgressTracker(object):
     def __init__(self, frequency=1000000):
         self.loss = 0.0
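For reference, a sketch of a standard backprop for row-wise L2 normalization Y = X / ||X||, checked against a finite-difference estimate. This is not part of the commit and is not the derivation used in backprop_normalize() above; all names are illustrative:

import numpy

def normalize_rows(X):
    norms = numpy.sqrt((X ** 2).sum(axis=1, keepdims=True))
    return X / norms

def backprop_normalize_rows(dY, X):
    # dX = dY / n - X * sum(X * dY, axis=1) / n**3, with n the per-row L2 norm
    norms = numpy.sqrt((X ** 2).sum(axis=1, keepdims=True))
    return dY / norms - X * (X * dY).sum(axis=1, keepdims=True) / norms ** 3

rng = numpy.random.RandomState(0)
X = rng.uniform(-1.0, 1.0, (4, 6))
dY = rng.uniform(-1.0, 1.0, (4, 6))
eps = 1e-6
numeric = numpy.zeros_like(X)
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        Xp, Xm = X.copy(), X.copy()
        Xp[i, j] += eps
        Xm[i, j] -= eps
        # Finite difference of the scalar sum(normalize_rows(X) * dY) w.r.t. X[i, j]
        numeric[i, j] = ((normalize_rows(Xp) * dY).sum() - (normalize_rows(Xm) * dY).sum()) / (2 * eps)
assert numpy.allclose(backprop_normalize_rows(dY, X), numeric, atol=1e-5)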
@@ -239,8 +323,8 @@ class ProgressTracker(object):
             status = (
                 epoch,
                 self.nr_word,
-                "%.5f" % self.loss,
-                "%.4f" % loss_per_word,
+                "%.8f" % self.loss,
+                "%.8f" % loss_per_word,
                 int(wps),
             )
             self.prev_loss = float(self.loss)