Improve spacy pretrain (#4393)

* Support bilstm_depth arg in spacy pretrain
* Add option to ignore zero vectors in get_cossim_loss
* Use cosine loss in Cloze multitask
commit 29f9fec267 (parent 9cd6ca3e4d)
spacy/_ml.py | 14
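Before the diffs, a quick usage sketch. This is hedged: it assumes the pretrain entry point is importable from spacy.cli as in spaCy 2.x, and the paths are placeholders. On the command line, the new option is exposed through the "lstm" short flag declared in the diff below (e.g. -lstm 2).

# Hedged sketch: driving the pretrain entry point from Python with the new
# bilstm_depth argument. Paths are placeholders, not real files.
from spacy.cli import pretrain

pretrain(
    "texts.jsonl",              # texts_loc: raw texts, one JSON object per line
    "/path/to/vectors_model",   # vectors_model: model whose vectors are the target
    "/path/to/output",          # output_dir
    bilstm_depth=2,             # new in this commit; requires PyTorch
)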
spacy/_ml.py
@@ -953,16 +953,24 @@ class CharacterEmbed(Model):
         return output, backprop_character_embed


-def get_cossim_loss(yh, y):
+def get_cossim_loss(yh, y, ignore_zeros=False):
+    xp = get_array_module(yh)
+    # Find the zero vectors
+    if ignore_zeros:
+        zero_indices = xp.abs(y).sum(axis=1) == 0
     # Add a small constant to avoid 0 vectors
     yh = yh + 1e-8
     y = y + 1e-8
     # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
-    xp = get_array_module(yh)
     norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
     norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
     mul_norms = norm_yh * norm_y
     cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
     d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
-    loss = xp.abs(cosine - 1).sum()
+    losses = xp.abs(cosine - 1)
+    if ignore_zeros:
+        # If the target was a zero vector, don't count it in the loss.
+        d_yh[zero_indices] = 0
+        losses[zero_indices] = 0
+    loss = losses.sum()
     return loss, -d_yh
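The function follows the linked derivation: for cos(yh, y) = yh.y / (|yh||y|), the partial derivative with respect to yh is y / (|yh||y|) - cos(yh, y) * yh / |yh|^2, and the loss is |cos - 1| summed over rows. Below, a standalone NumPy sketch of the patched function with a quick check that a zero target vector contributes neither loss nor gradient. Illustrative only: the real function uses thinc's get_array_module, so it also runs on CuPy arrays.

import numpy as np

def cossim_loss(yh, y, ignore_zeros=False):
    # Standalone NumPy rendering of get_cossim_loss above, for illustration.
    if ignore_zeros:
        zero_indices = np.abs(y).sum(axis=1) == 0  # rows whose target is all zeros
    yh = yh + 1e-8  # small constant to avoid dividing by a zero norm
    y = y + 1e-8
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    losses = np.abs(cosine - 1)
    if ignore_zeros:
        d_yh[zero_indices] = 0    # masked rows contribute no gradient
        losses[zero_indices] = 0  # ...and no loss
    return losses.sum(), -d_yh

yh = np.array([[1.0, 0.0], [0.3, 0.4]])
y = np.array([[1.0, 0.0], [0.0, 0.0]])  # second target is a zero vector
loss, d = cossim_loss(yh, y, ignore_zeros=True)
assert loss < 1e-6 and (d[1] == 0).all()  # only the matching first row counts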
spacy/cli/pretrain.py
@@ -35,6 +35,7 @@ from .train import _load_pretrained_tok2vec
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
+    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
     loss_func=(
         "Loss function to use for the objective. Either 'L2' or 'cosine'",
@@ -80,6 +81,7 @@ def pretrain(
     output_dir,
     width=96,
     depth=4,
+    bilstm_depth=2,
     embed_rows=2000,
     loss_func="cosine",
     use_vectors=False,
@@ -116,6 +118,10 @@ def pretrain(
     util.fix_random_seed(seed)

     has_gpu = prefer_gpu()
+    if has_gpu:
+        import torch
+
+        torch.set_default_tensor_type("torch.cuda.FloatTensor")
     msg.info("Using GPU" if has_gpu else "Not using GPU")

     output_dir = Path(output_dir)
@@ -151,7 +157,7 @@ def pretrain(
         embed_rows,
         conv_depth=depth,
         pretrained_vectors=pretrained_vectors,
-        bilstm_depth=0,  # Requires PyTorch. Experimental.
+        bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
         cnn_maxout_pieces=3,  # You can try setting this higher
         subword_features=True,  # Set to False for Chinese etc
     ),
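The new argument replaces the hard-coded bilstm_depth=0 in the Tok2Vec call above. For concreteness, a hedged sketch of the resulting call, assuming spaCy 2.x's internal spacy._ml.Tok2Vec API and an available PyTorch install; the argument order mirrors the diff, with pretrained_vectors left at its default here.

# Hedged sketch: the tok2vec model that pretrain builds, with BiLSTM layers
# enabled. Assumes spaCy 2.x internals; bilstm_depth > 0 requires PyTorch.
from spacy._ml import Tok2Vec

tok2vec = Tok2Vec(
    96,                     # width
    2000,                   # embed_rows
    conv_depth=4,
    bilstm_depth=2,         # was hard-coded to 0 before this commit
    cnn_maxout_pieces=3,
    subword_features=True,  # set to False for Chinese etc
)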
spacy/pipeline/pipes.pyx
@@ -29,7 +29,7 @@ from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
-from .._ml import masked_language_model, create_default_optimizer
+from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
 from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util

@@ -880,8 +880,7 @@ class ClozeMultitask(Pipe):
         # and look them up all at once. This prevents data copying.
         ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
         target = vectors[ids]
-        gradient = (prediction - target) / prediction.shape[0]
-        loss = (gradient**2).sum()
+        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
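ClozeMultitask previously used a plain L2 objective on the predicted vectors; it now calls get_cossim_loss with ignore_zeros=True, so tokens whose target vector is all zeros (for example, tokens with no pretrained vector) no longer drag predictions toward zero. A hedged before-and-after sketch with made-up data, assuming this commit is installed:

import numpy as np
from spacy._ml import get_cossim_loss  # signature as patched above

prediction = np.array([[0.5, 0.5], [0.3, -0.2]], dtype="float32")
target = np.array([[1.0, 1.0], [0.0, 0.0]], dtype="float32")  # row 1: no vector

# Old objective: the zero-vector row still contributes loss and gradient.
l2_gradient = (prediction - target) / prediction.shape[0]
l2_loss = (l2_gradient ** 2).sum()

# New objective: the zero-vector row is masked out of both.
cos_loss, cos_gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
assert (cos_gradient[1] == 0).all()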