From ab7f85dfa28422fbf24dceb4f7a04344b466648f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 22 Oct 2019 03:26:27 +0200
Subject: [PATCH] Use Mish layer if pieces==1 in CNN

---
 spacy/_ml.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 31e0767c7..4ed683861 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 import numpy
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, Mish
 from thinc.i2v import HashEmbed, StaticVectors
 from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
@@ -55,7 +55,8 @@ def create_default_optimizer(ops, **cfg):
     eps = util.env_opt("optimizer_eps", 1e-8)
     L2 = util.env_opt("L2_penalty", 1e-6)
     max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
-    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
+    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps,
+                     lookahead_k=6, lookahead_alpha=0.5, use_lars=True, use_radam=True)
     optimizer.max_grad_norm = max_grad_norm
     optimizer.device = ops.device
     return optimizer
@@ -375,9 +376,15 @@ def Tok2Vec_chars_bilstm(width, embed_size, **kwargs):
 
 
 def CNN(width, depth, pieces, nW=1):
-    layer = chain(
-        ExtractWindow(nW=nW),
-        LN(Maxout(width, width * (nW*2+1), pieces=pieces)))
+    if pieces == 1:
+        layer = chain(
+            ExtractWindow(nW=nW),
+            LN(Mish(width, width * (nW*2+1)))
+        )
+    else:
+        layer = chain(
+            ExtractWindow(nW=nW),
+            LN(Maxout(width, width * (nW*2+1), pieces=pieces)))
     return clone(Residual(layer), depth)
 
 
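
Note (not part of the patch): for reference, Mish is the self-regularizing activation mish(x) = x * tanh(softplus(x)), and thinc's Mish layer, like its ReLu and Maxout layers, pairs an affine projection with the nonlinearity, here mapping the width * (nW*2+1) window features back down to width. Below is a minimal NumPy sketch of the activation alone, assuming nothing beyond NumPy; shapes are illustrative, not taken from the patch.

import numpy as np

def mish(x):
    # Mish activation (Misra, 2019): x * tanh(softplus(x)).
    # np.logaddexp(0, x) computes softplus, log(1 + exp(x)), without overflow.
    return x * np.tanh(np.logaddexp(0.0, x))

# Elementwise check on a window-concatenated block: width=96, nW=1 gives
# width * (nW*2+1) = 288 input features per token (hypothetical sizes).
x = np.random.randn(4, 288).astype("f")
print(mish(x).shape)  # (4, 288)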