Implement cosine loss for spacy pretrain. Make default

Matthew Honnibal 2019-03-20 11:06:35 +00:00
parent 685fff40cf
commit 1612990e88


@@ -9,7 +9,7 @@ from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
 from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu
+from thinc.neural.util import prefer_gpu, get_array_module
 from wasabi import Printer
 import srsly
@@ -27,6 +27,7 @@ from .. import util
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Embedding rows", "option", "er", int),
+    loss_func=("Loss to use for the objective. L2 or cosine", "option", "L", str),
     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
     dropout=("Dropout", "option", "d", float),
     batch_size=("Number of words per training batch", "option", "bs", int),
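
With the option wired into the annotations above, the objective can be picked on the command line via the short flag -L defined there. An illustrative invocation (paths and vectors model are placeholders), overriding the new cosine default back to L2:

    python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain_output -L L2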
@@ -42,6 +43,7 @@ def pretrain(
     width=96,
     depth=4,
     embed_rows=2000,
+    loss_func="cosine",
     use_vectors=False,
     dropout=0.2,
     nr_iter=1000,
@@ -123,7 +125,7 @@ def pretrain(
                 max_length=max_length,
                 min_length=min_length,
             )
-            loss = make_update(model, docs, optimizer, drop=dropout)
+            loss = make_update(model, docs, optimizer, objective=loss_func, drop=dropout)
             progress = tracker.update(epoch, loss, docs)
             if progress:
                 msg.row(progress, **row_settings)
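
make_update itself is not part of this diff. As a rough sketch of how the new objective keyword plausibly reaches get_vectors_loss below, assuming thinc's usual begin_update/backprop pattern (the body here is an assumption, not the committed code):

def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
    # Assumed wiring: run the forward pass, score the predictions against
    # the target vectors with the selected objective, then backprop.
    predictions, backprop = model.begin_update(docs, drop=drop)
    loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
    backprop(gradients, sgd=optimizer)
    return loss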
@@ -196,11 +198,26 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
     if objective == "L2":
-        d_scores = prediction - target
-        loss = (d_scores ** 2).sum()
-    else:
-        raise NotImplementedError(objective)
-    return loss, d_scores
+        d_target = prediction - target
+        loss = (d_target ** 2).sum()
+    elif objective == "cosine":
+        loss, d_target = get_cossim_loss(prediction, target)
+    return loss, d_target
 
 
+def get_cossim_loss(yh, y):
+    # Add a small constant to avoid 0 vectors
+    yh = yh + 1e-8
+    y = y + 1e-8
+    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+    xp = get_array_module(yh)
+    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+    mul_norms = norm_yh * norm_y
+    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2))
+    loss = xp.abs(cosine-1).sum()
+    return loss, -d_yh
+
+
 def create_pretraining_model(nlp, tok2vec):
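
For reference, the gradient in get_cossim_loss is the standard partial derivative of cosine similarity with respect to the prediction (the Stack Exchange link in the code derives it): for a predicted row $\hat{y}$ and target row $y$,

$$\cos\theta = \frac{\hat{y}\cdot y}{\lVert\hat{y}\rVert\,\lVert y\rVert}, \qquad \frac{\partial\cos\theta}{\partial\hat{y}} = \frac{y}{\lVert\hat{y}\rVert\,\lVert y\rVert} - \cos\theta\,\frac{\hat{y}}{\lVert\hat{y}\rVert^{2}}.$$

Because cosine similarity never exceeds 1, the loss $\sum_i \lvert\cos\theta_i - 1\rvert$ equals $\sum_i (1 - \cos\theta_i)$, so the gradient of the loss is the negation of d_yh; that is why the function returns -d_yh. A standalone finite-difference check of that gradient, using plain NumPy in place of thinc's get_array_module (a verification sketch, not part of the commit):

import numpy as np

def cossim_loss(yh, y):
    # Same arithmetic as the patched get_cossim_loss, with numpy as xp.
    yh = yh + 1e-8
    y = y + 1e-8
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2))
    return np.abs(cosine - 1).sum(), -d_yh

rng = np.random.RandomState(0)
yh = rng.uniform(-1.0, 1.0, (4, 8))
y = rng.uniform(-1.0, 1.0, (4, 8))
_, analytic = cossim_loss(yh, y)

# Estimate the gradient numerically, one coordinate at a time.
eps = 1e-6
numeric = np.zeros_like(yh)
for i in range(yh.shape[0]):
    for j in range(yh.shape[1]):
        plus, minus = yh.copy(), yh.copy()
        plus[i, j] += eps
        minus[i, j] -= eps
        numeric[i, j] = (cossim_loss(plus, y)[0] - cossim_loss(minus, y)[0]) / (2 * eps)

print(np.allclose(analytic, numeric, atol=1e-5))  # expect True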