Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26)
Update hyper-parameters after NER random search (#2972)
These experiments were completed a few weeks ago, but I didn't make the PR, pending model release.

Token vector width: 128 -> 96
Hidden width: 128 -> 64
Embed size: 5000 -> 2000
Dropout: 0.2 -> 0.1
Updated optimizer defaults (unclear how important?)

This should improve speed, model size and load time, while keeping similar or slightly better accuracy. The tl;dr is we prefer to prevent over-fitting by reducing model size, rather than using more dropout.
parent b6e991440c
commit ef0820827a

spacy/_ml.py (12 lines changed)
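All of the values changed in this commit are read through spacy.util.env_opt, which, as far as I recall its spaCy 2.0.x behaviour, only falls back to these hard-coded defaults when no matching environment variable is set, so the old settings stay reachable per run without code edits. A minimal sketch of that fallback (illustrative, not part of the commit):

# Sketch only: env_opt falls back to the default unless a matching
# environment variable is set.
import os
from spacy import util

os.environ["token_vector_width"] = "128"        # request the pre-change width
print(util.env_opt("token_vector_width", 96))   # -> 128, env override wins

del os.environ["token_vector_width"]
print(util.env_opt("token_vector_width", 96))   # -> 96, the new default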
@@ -48,11 +48,11 @@ def cosine(vec1, vec2):
 
 def create_default_optimizer(ops, **cfg):
     learn_rate = util.env_opt('learn_rate', 0.001)
-    beta1 = util.env_opt('optimizer_B1', 0.9)
-    beta2 = util.env_opt('optimizer_B2', 0.9)
-    eps = util.env_opt('optimizer_eps', 1e-12)
+    beta1 = util.env_opt('optimizer_B1', 0.8)
+    beta2 = util.env_opt('optimizer_B2', 0.8)
+    eps = util.env_opt('optimizer_eps', 0.00001)
     L2 = util.env_opt('L2_penalty', 1e-6)
-    max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+    max_grad_norm = util.env_opt('grad_norm_clip', 5.)
     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
                      beta2=beta2, eps=eps)
     optimizer.max_grad_norm = max_grad_norm

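For reference, a rough sketch of the optimizer these new defaults produce when nothing is overridden. It calls the function from this hunk directly; NumpyOps is assumed here as the CPU backend from the Thinc version of that era, standing in for whatever ops the pipeline would normally pass:

# Sketch only: with no environment overrides, create_default_optimizer now
# yields Adam with beta1=beta2=0.8, eps=1e-5, L2=1e-6 and a gradient-norm
# clip of 5.0 (previously 0.9 / 0.9 / 1e-12 / 1.0).
from thinc.neural.ops import NumpyOps
from spacy._ml import create_default_optimizer

optimizer = create_default_optimizer(NumpyOps())
assert optimizer.max_grad_norm == 5.0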
@@ -445,11 +445,11 @@ def getitem(i):
 
 
 def build_tagger_model(nr_class, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
+    embed_size = util.env_opt('embed_size', 2000)
     if 'token_vector_width' in cfg:
         token_vector_width = cfg['token_vector_width']
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
     pretrained_vectors = cfg.get('pretrained_vectors')
     subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):

@@ -90,11 +90,11 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
-                                  util.env_opt('dropout_to', 0.2),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
+                                  util.env_opt('dropout_to', 0.1),
                                   util.env_opt('dropout_decay', 0.0))
-    batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
-                                   util.env_opt('batch_to', 1000),
+    batch_sizes = util.compounding(util.env_opt('batch_from', 750),
+                                   util.env_opt('batch_to', 750),
                                    util.env_opt('batch_compound', 1.001))
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()

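To see the schedule change in context, here is a small, self-contained sketch in the usual spaCy 2.x training-loop style that consumes the decaying/compounding generators the same way train() does. The toy data, the ANIMAL label and the iteration count are made up for illustration. Note that with dropout_from == dropout_to and batch_from == batch_to, both schedules are effectively constant unless overridden.

# Illustrative only: a blank English pipeline with one toy NER example,
# trained with the same schedule helpers the CLI uses.
import spacy
from spacy.util import minibatch, compounding, decaying

TRAIN_DATA = [("I saw a horse", {"entities": [(8, 13, "ANIMAL")]})]

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("ANIMAL")

optimizer = nlp.begin_training()
dropout_rates = decaying(0.1, 0.1, 0.0)      # new defaults: a flat 0.1
batch_sizes = compounding(750, 750, 1.001)   # new defaults: a flat 750

for i in range(10):
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=batch_sizes):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=next(dropout_rates),
                   sgd=optimizer, losses=losses)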
@@ -759,7 +759,7 @@ class Tagger(Pipe):
             if self.model is True:
                 token_vector_width = util.env_opt(
                     'token_vector_width',
-                    self.cfg.get('token_vector_width', 128))
+                    self.cfg.get('token_vector_width', 96))
                 self.model = self.Model(self.vocab.morphology.n_tags,
                                         **self.cfg)
             self.model.from_bytes(b)

@@ -878,7 +878,7 @@ class MultitaskObjective(Tagger):
 
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
         softmax = Softmax(n_tags, token_vector_width)
         model = chain(
             tok2vec,

@@ -63,9 +63,9 @@ cdef class Parser:
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
-                                          cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 5000))
+                                          cfg.get('token_vector_width', 96))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,