diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 0ea895597..e44af8b48 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -9,7 +9,7 @@ from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
 from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu
+from thinc.neural.util import prefer_gpu, get_array_module
 from wasabi import Printer
 import srsly
 
@@ -27,6 +27,7 @@ from .. import util
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Embedding rows", "option", "er", int),
+    loss_func=("Loss to use for the objective. L2 or cosine", "option", "L", str),
     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
     dropout=("Dropout", "option", "d", float),
     batch_size=("Number of words per training batch", "option", "bs", int),
@@ -42,6 +43,7 @@ def pretrain(
     width=96,
     depth=4,
     embed_rows=2000,
+    loss_func="cosine",
     use_vectors=False,
     dropout=0.2,
     nr_iter=1000,
@@ -123,7 +125,7 @@ def pretrain(
                 max_length=max_length,
                 min_length=min_length,
             )
-            loss = make_update(model, docs, optimizer, drop=dropout)
+            loss = make_update(model, docs, optimizer, objective=loss_func, drop=dropout)
             progress = tracker.update(epoch, loss, docs)
             if progress:
                 msg.row(progress, **row_settings)
@@ -196,11 +198,37 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
     if objective == "L2":
-        d_scores = prediction - target
-        loss = (d_scores ** 2).sum()
+        d_target = prediction - target
+        loss = (d_target ** 2).sum()
+    elif objective == "cosine":
+        loss, d_target = get_cossim_loss(prediction, target)
     else:
         raise NotImplementedError(objective)
-    return loss, d_scores
+    return loss, d_target
+
+
+def get_cossim_loss(yh, y):
+    """Compute the cosine-distance loss and its gradient w.r.t. the prediction.
+
+    yh: Predicted vectors, one per row (reductions run over axis 1).
+    y: Target vectors, same layout as yh.
+    RETURNS (tuple): The summed loss, sum(|cosine(yh, y) - 1|), and the
+        gradient of that loss with respect to yh.
+    """
+    # Add a small constant to avoid 0 vectors
+    yh = yh + 1e-8
+    y = y + 1e-8
+    # Gradient of the cosine similarity, see:
+    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+    xp = get_array_module(yh)
+    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+    mul_norms = norm_yh * norm_y
+    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
+    loss = xp.abs(cosine - 1).sum()
+    # The loss falls as the cosine rises, so negate the similarity gradient.
+    return loss, -d_yh
 
 
 def create_pretraining_model(nlp, tok2vec):