Update hyper-parameters after NER random search (#2972)

These experiments were completed a few weeks ago, but I held off on making the PR pending the model release.

    Token vector width: 128->96
    Hidden width: 128->64
    Embed size: 5000->2000
    Dropout: 0.2->0.1
    Batch size: 1000->750
    Updated optimizer defaults (unclear how important?)

This should improve speed, model size and load time, while keeping
accuracy the same or slightly better.

The tl;dr is that we prefer to prevent over-fitting by reducing model size
rather than by increasing dropout.
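
All of these settings are read through util.env_opt, so the new defaults can still be
overridden without touching code. A minimal sketch, assuming env_opt falls back to the
plain option name in os.environ (the exact lookup rules, including any SPACY_-prefixed
variant, are an assumption here, not verified against the source):

    import os

    # Hypothetical override: restore the previous widths before the models are built.
    # env_opt() is assumed to read these names from the environment and fall back to
    # the new defaults (96 / 64 / 2000) when they are unset.
    os.environ["token_vector_width"] = "128"
    os.environ["hidden_width"] = "128"
    os.environ["embed_size"] = "5000"

    # ...then train as usual, e.g. via the spacy train CLI or nlp.begin_training().
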
Matthew Honnibal 2018-11-27 18:49:52 +01:00 committed by GitHub
parent b6e991440c
commit ef0820827a
4 changed files with 15 additions and 15 deletions

@@ -48,11 +48,11 @@ def cosine(vec1, vec2):
 def create_default_optimizer(ops, **cfg):
     learn_rate = util.env_opt('learn_rate', 0.001)
-    beta1 = util.env_opt('optimizer_B1', 0.9)
-    beta2 = util.env_opt('optimizer_B2', 0.9)
-    eps = util.env_opt('optimizer_eps', 1e-12)
+    beta1 = util.env_opt('optimizer_B1', 0.8)
+    beta2 = util.env_opt('optimizer_B2', 0.8)
+    eps = util.env_opt('optimizer_eps', 0.00001)
     L2 = util.env_opt('L2_penalty', 1e-6)
-    max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+    max_grad_norm = util.env_opt('grad_norm_clip', 5.)
     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
                      beta2=beta2, eps=eps)
     optimizer.max_grad_norm = max_grad_norm
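
One way to read the new optimizer defaults: Adam's moment estimates are exponential
moving averages, and 1 / (1 - beta) gives a rough effective averaging window, so
beta1 = beta2 = 0.8 averages over roughly the last 5 updates instead of 10 and reacts
faster. The larger eps (1e-12 -> 1e-5) keeps the update denominator further from zero,
and raising grad_norm_clip from 1.0 to 5.0 means far fewer gradients get rescaled.
A quick back-of-envelope check (plain Python, nothing spaCy-specific):

    # Effective averaging window of an exponential moving average: 1 / (1 - beta)
    for name, old, new in [("beta1", 0.9, 0.8), ("beta2", 0.9, 0.8)]:
        print(name, round(1 / (1 - old), 2), "->", round(1 / (1 - new), 2))
    # beta1 10.0 -> 5.0
    # beta2 10.0 -> 5.0
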
@@ -445,11 +445,11 @@ def getitem(i):
 def build_tagger_model(nr_class, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
+    embed_size = util.env_opt('embed_size', 2000)
     if 'token_vector_width' in cfg:
         token_vector_width = cfg['token_vector_width']
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
     pretrained_vectors = cfg.get('pretrained_vectors')
     subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):
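
The resolution order in build_tagger_model is unchanged; only the fallback moves from
128 to 96: an explicit cfg entry wins, then the environment variable, then the new
default. An illustrative helper with that precedence (the environment lookup follows
the same assumption as above and is not spaCy's exact env_opt code):

    import os

    def resolve_token_vector_width(cfg):
        # 1) explicit value passed in the component config
        if "token_vector_width" in cfg:
            return cfg["token_vector_width"]
        # 2) assumed environment override by plain name
        if "token_vector_width" in os.environ:
            return int(os.environ["token_vector_width"])
        # 3) new default after the random search
        return 96

    print(resolve_token_vector_width({"token_vector_width": 128}))  # 128
    print(resolve_token_vector_width({}))  # 96, unless the env var is set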

@@ -90,11 +90,11 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
-                                  util.env_opt('dropout_to', 0.2),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
+                                  util.env_opt('dropout_to', 0.1),
                                   util.env_opt('dropout_decay', 0.0))
-    batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
-                                   util.env_opt('batch_to', 1000),
+    batch_sizes = util.compounding(util.env_opt('batch_from', 750),
+                                   util.env_opt('batch_to', 750),
                                    util.env_opt('batch_compound', 1.001))
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()
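
With dropout_from == dropout_to and batch_from == batch_to, both schedules are now
effectively constant: dropout stays at 0.1 and the batch size at 750 for the whole run,
since the decay and compound factors only matter when the endpoints differ. An
illustrative pair of generators with the same shape as util.decaying / util.compounding,
not spaCy's exact implementations:

    from itertools import islice

    def _clip(value, start, stop):
        lo, hi = min(start, stop), max(start, stop)
        return max(lo, min(value, hi))

    def decaying(start, stop, decay):
        # Walk from start towards stop by `decay` per step, clipped at the endpoint.
        value = start
        while True:
            yield _clip(value, start, stop)
            value -= decay

    def compounding(start, stop, compound):
        # Multiply by `compound` each step, clipped at the endpoint.
        value = start
        while True:
            yield _clip(value, start, stop)
            value *= compound

    print(list(islice(decaying(0.1, 0.1, 0.0), 3)))       # [0.1, 0.1, 0.1]
    print(list(islice(compounding(750, 750, 1.001), 3)))  # [750, 750, 750]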

@@ -759,7 +759,7 @@ class Tagger(Pipe):
         if self.model is True:
             token_vector_width = util.env_opt(
                 'token_vector_width',
-                self.cfg.get('token_vector_width', 128))
+                self.cfg.get('token_vector_width', 96))
             self.model = self.Model(self.vocab.morphology.n_tags,
                                     **self.cfg)
             self.model.from_bytes(b)
@@ -878,7 +878,7 @@ class MultitaskObjective(Tagger):
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
         softmax = Softmax(n_tags, token_vector_width)
         model = chain(
             tok2vec,

@@ -63,9 +63,9 @@ cdef class Parser:
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
-                                          cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 5000))
+                                          cfg.get('token_vector_width', 96))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,
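
A rough sense of where the size reduction comes from in the parser/NER tok2vec: cutting
the hash-embedding table from 5000 to 2000 rows and the vector width from 128 to 96
shrinks that table to about 30% of its old parameter count. This is only back-of-envelope
arithmetic over a single rows-by-width table; the real model has several per-attribute
tables plus convolutional and hidden layers, so the overall saving is smaller:

    def embed_params(rows, width):
        # One hash-embedding table of `rows` vectors, each `width` wide.
        return rows * width

    old = embed_params(5000, 128)  # 640000
    new = embed_params(2000, 96)   # 192000
    print(old, new, f"{new / old:.0%}")  # 640000 192000 30%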