diff --git a/spacy/_ml.py b/spacy/_ml.py
index 679b1aef6..0cfdec7e9 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -48,11 +48,11 @@ def cosine(vec1, vec2):
 
 def create_default_optimizer(ops, **cfg):
     learn_rate = util.env_opt('learn_rate', 0.001)
-    beta1 = util.env_opt('optimizer_B1', 0.9)
-    beta2 = util.env_opt('optimizer_B2', 0.9)
-    eps = util.env_opt('optimizer_eps', 1e-12)
+    beta1 = util.env_opt('optimizer_B1', 0.8)
+    beta2 = util.env_opt('optimizer_B2', 0.8)
+    eps = util.env_opt('optimizer_eps', 0.00001)
     L2 = util.env_opt('L2_penalty', 1e-6)
-    max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+    max_grad_norm = util.env_opt('grad_norm_clip', 5.)
     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
                      beta2=beta2, eps=eps)
     optimizer.max_grad_norm = max_grad_norm
@@ -445,11 +445,11 @@ def getitem(i):
 
 
 def build_tagger_model(nr_class, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
+    embed_size = util.env_opt('embed_size', 2000)
     if 'token_vector_width' in cfg:
         token_vector_width = cfg['token_vector_width']
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
     pretrained_vectors = cfg.get('pretrained_vectors')
     subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 01aebfae8..01c8cb199 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -90,11 +90,11 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
-                                  util.env_opt('dropout_to', 0.2),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
+                                  util.env_opt('dropout_to', 0.1),
                                   util.env_opt('dropout_decay', 0.0))
-    batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
-                                   util.env_opt('batch_to', 1000),
+    batch_sizes = util.compounding(util.env_opt('batch_from', 750),
+                                   util.env_opt('batch_to', 750),
                                    util.env_opt('batch_compound', 1.001))
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index f7c4ec4e0..e2a244080 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -759,7 +759,7 @@ class Tagger(Pipe):
         if self.model is True:
             token_vector_width = util.env_opt(
                 'token_vector_width',
-                self.cfg.get('token_vector_width', 128))
+                self.cfg.get('token_vector_width', 96))
             self.model = self.Model(self.vocab.morphology.n_tags,
                                     **self.cfg)
         self.model.from_bytes(b)
@@ -878,7 +878,7 @@ class MultitaskObjective(Tagger):
 
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
         softmax = Softmax(n_tags, token_vector_width)
         model = chain(
             tok2vec,
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 82e87ae61..0cecdb93b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -63,9 +63,9 @@ cdef class Parser:
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
-                                          cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 5000))
+                                          cfg.get('token_vector_width', 96))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,
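
Note on the mechanism: every value changed above is only a default. util.env_opt consults the environment before falling back to the hard-coded number, so the previous settings stay reachable at runtime without editing the source. Below is a minimal sketch of that lookup pattern, not the verbatim implementation; the SPACY_-prefixed variable name and the coercion to the default's type are assumptions about this version of spacy/util.py:

    import os

    def env_opt(name, default=None):
        # Sketch (assumed behaviour): an environment variable named
        # after the option overrides the hard-coded default, coerced
        # to the default's type so floats stay floats and ints stay ints.
        convert = float if isinstance(default, float) else int
        key = 'SPACY_' + name.upper()
        if key in os.environ:
            return convert(os.environ[key])
        return default

    # Example: restore the pre-patch dropout schedule at training time,
    # without touching spacy/cli/train.py:
    #   $ SPACY_DROPOUT_FROM=0.2 SPACY_DROPOUT_TO=0.2 python -m spacy train ...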