Patch summary (spacy/_ml.py):
  * Import Mish from thinc.v2v alongside the existing layer imports.
  * create_default_optimizer: pass lookahead_k=6, lookahead_alpha=0.5,
    use_lars=True and use_radam=True to the Adam constructor.
  * CNN: build the layer with LN(Mish(...)) when pieces == 1; otherwise keep
    the existing LN(Maxout(..., pieces=pieces)) layer.

diff --git a/spacy/_ml.py b/spacy/_ml.py
index 31e0767c7..4ed683861 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 import numpy
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, Mish
 from thinc.i2v import HashEmbed, StaticVectors
 from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
@@ -55,7 +55,8 @@ def create_default_optimizer(ops, **cfg):
     eps = util.env_opt("optimizer_eps", 1e-8)
     L2 = util.env_opt("L2_penalty", 1e-6)
     max_grad_norm = util.env_opt("grad_norm_clip", 1.0)
-    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
+    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps,
+                     lookahead_k=6, lookahead_alpha=0.5, use_lars=True, use_radam=True)
     optimizer.max_grad_norm = max_grad_norm
     optimizer.device = ops.device
     return optimizer
@@ -375,9 +376,15 @@ def Tok2Vec_chars_bilstm(width, embed_size, **kwargs):
 
 
 def CNN(width, depth, pieces, nW=1):
-    layer = chain(
-        ExtractWindow(nW=nW),
-        LN(Maxout(width, width * (nW*2+1), pieces=pieces)))
+    if pieces == 1:
+        layer = chain(
+            ExtractWindow(nW=nW),
+            LN(Mish(width, width * (nW*2+1)))
+        )
+    else:
+        layer = chain(
+            ExtractWindow(nW=nW),
+            LN(Maxout(width, width * (nW*2+1), pieces=pieces)))
     return clone(Residual(layer), depth)
 
 