Merge pull request #1497 from explosion/feature/improve-optimizer-handling

💫 Improve optimizer handling
Commit 6fdffd7246 by Matthew Honnibal, 2017-11-06 16:41:15 +01:00, committed via GitHub
6 changed files with 119 additions and 58 deletions

View File

@@ -15,12 +15,12 @@ from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module, copy_array
 from thinc.neural._lsuv import svd_orthonormal
+from thinc.neural.optimizers import Adam
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp
-from thinc.neural._lsuv import svd_orthonormal
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
 from . import util
@@ -39,6 +39,19 @@ def cosine(vec1, vec2):
     return vec1.dot(vec2) / (norm1 * norm2)
 
 
+def create_default_optimizer(ops, **cfg):
+    learn_rate = util.env_opt('learn_rate', 0.001)
+    beta1 = util.env_opt('optimizer_B1', 0.9)
+    beta2 = util.env_opt('optimizer_B2', 0.999)
+    eps = util.env_opt('optimizer_eps', 1e-08)
+    L2 = util.env_opt('L2_penalty', 1e-6)
+    max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+    optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
+                     beta2=beta2, eps=eps)
+    optimizer.max_grad_norm = max_grad_norm
+    optimizer.device = ops.device
+    return optimizer
+
+
 @layerize
 def _flatten_add_lengths(seqs, pad=0, drop=0.):
     ops = Model.ops

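For reference, a minimal usage sketch of the new factory (an illustration under stated assumptions, not code from this commit): create_default_optimizer reads its hyperparameters through util.env_opt and returns a thinc Adam instance for the given ops backend.

from thinc.neural.ops import NumpyOps
from spacy._ml import create_default_optimizer

# Build the default Adam optimizer for the CPU backend; the hyperparameters
# (learn_rate, optimizer_B1, optimizer_B2, optimizer_eps, L2_penalty,
# grad_norm_clip) come from util.env_opt defaults unless overridden.
optimizer = create_default_optimizer(NumpyOps())

# The returned object is what the rest of this diff passes around as `sgd`,
# e.g. nlp.update(docs, golds, sgd=optimizer)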
View File

@@ -19,7 +19,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .compat import json_dumps, izip
 from .scorer import Scorer
-from ._ml import link_vectors_to_models
+from ._ml import link_vectors_to_models, create_default_optimizer
 from .attrs import IS_STOP
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
@@ -407,27 +407,7 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold
 
-    def resume_training(self, **cfg):
-        if cfg.get('device', -1) >= 0:
-            device = util.use_gpu(cfg['device'])
-            if self.vocab.vectors.data.shape[1] >= 1:
-                self.vocab.vectors.data = Model.ops.asarray(
-                    self.vocab.vectors.data)
-        else:
-            device = None
-        learn_rate = util.env_opt('learn_rate', 0.001)
-        beta1 = util.env_opt('optimizer_B1', 0.9)
-        beta2 = util.env_opt('optimizer_B2', 0.999)
-        eps = util.env_opt('optimizer_eps', 1e-08)
-        L2 = util.env_opt('L2_penalty', 1e-6)
-        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
-        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
-                               beta2=beta2, eps=eps)
-        self._optimizer.max_grad_norm = max_grad_norm
-        self._optimizer.device = device
-        return self._optimizer
-
-    def begin_training(self, get_gold_tuples=None, **cfg):
+    def begin_training(self, get_gold_tuples=None, sgd=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
@@ -452,21 +432,14 @@ class Language(object):
         else:
             device = None
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = create_default_optimizer(Model.ops)
+        self._optimizer = sgd
         for name, proc in self.pipeline:
             if hasattr(proc, 'begin_training'):
-                context = proc.begin_training(get_gold_tuples(),
-                                              pipeline=self.pipeline)
-                contexts.append(context)
-        learn_rate = util.env_opt('learn_rate', 0.001)
-        beta1 = util.env_opt('optimizer_B1', 0.9)
-        beta2 = util.env_opt('optimizer_B2', 0.999)
-        eps = util.env_opt('optimizer_eps', 1e-08)
-        L2 = util.env_opt('L2_penalty', 1e-6)
-        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
-        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
-                               beta2=beta2, eps=eps)
-        self._optimizer.max_grad_norm = max_grad_norm
-        self._optimizer.device = device
+                proc.begin_training(get_gold_tuples(),
+                                    pipeline=self.pipeline,
+                                    sgd=self._optimizer)
         return self._optimizer
 
     def evaluate(self, docs_golds, verbose=False):

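To illustrate the revised Language.begin_training contract, a rough sketch (assuming a blank English pipeline so no component needs gold data; not taken from the diff itself):

from spacy.lang.en import English

nlp = English()

# With sgd=None, Language builds the default Adam via create_default_optimizer,
# stores it as nlp._optimizer and returns it.
optimizer = nlp.begin_training(lambda: [])

# Passing an optimizer of your own works the same way, and the call returns
# the object you passed in:
#     optimizer = nlp.begin_training(lambda: [], sgd=my_optimizer)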
View File

@@ -30,6 +30,7 @@ from .attrs import POS
 from .parts_of_speech import X
 from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
 from ._ml import link_vectors_to_models, zero_init, flatten
+from ._ml import create_default_optimizer
 from . import util
@@ -139,12 +140,19 @@ class Pipe(object):
         """
         raise NotImplementedError
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
+    def create_optimizer(self):
+        return create_default_optimizer(self.model.ops,
+                                        **self.cfg.get('optimizer', {}))
+
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
         If no model has been initialized yet, the model is added."""
         if self.model is True:
             self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
     def use_params(self, params):
         """Modify the pipe's model, to use the given parameter values."""
@@ -336,8 +344,8 @@ class Tensorizer(Pipe):
         loss = (d_scores**2).sum()
         return loss, d_scores
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        """Allocate models, pre-process training data and acquire a trainer and
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
+        """Allocate models, pre-process training data and acquire an
         optimizer.
 
         gold_tuples (iterable): Gold-standard training data.
@@ -349,9 +357,11 @@ class Tensorizer(Pipe):
         if self.model is True:
             self.cfg['input_size'] = 384
             self.cfg['output_size'] = 300
-            #self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
 
 class Tagger(Pipe):
@@ -457,7 +467,7 @@ class Tagger(Pipe):
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for raw_text, annots_brackets in gold_tuples:
@@ -477,6 +487,9 @@ class Tagger(Pipe):
             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
     @classmethod
     def Model(cls, n_tags, **cfg):
@@ -627,7 +640,8 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
+                       sgd=None):
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
@@ -643,6 +657,9 @@ class MultitaskObjective(Tagger):
                 Softmax(len(self.labels), token_vector_width)
             )
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
@@ -739,7 +756,7 @@ class SimilarityHook(Pipe):
     def update(self, doc1_doc2, golds, sgd=None, drop=0.):
         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
 
-    def begin_training(self, _=tuple(), pipeline=None):
+    def begin_training(self, _=tuple(), pipeline=None, sgd=None):
         """Allocate model, using width from tensorizer in pipeline.
 
         gold_tuples (iterable): Gold-standard training data.
@@ -748,6 +765,9 @@ class SimilarityHook(Pipe):
         if self.model is True:
             self.model = self.Model(pipeline[0].model.nO)
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
 
 class TextCategorizer(Pipe):
@@ -831,7 +851,7 @@ class TextCategorizer(Pipe):
         self.labels.append(label)
         return 1
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
         else:
@@ -841,6 +861,9 @@ class TextCategorizer(Pipe):
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
         link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
 
 cdef class DependencyParser(Parser):
@@ -851,12 +874,12 @@ cdef class DependencyParser(Parser):
     def postprocesses(self):
         return [nonproj.deprojectivize]
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline,
-                                    tok2vec=tok2vec)
+                                    tok2vec=tok2vec, sgd=sgd)
             pipeline.append(labeller)
             self._multitasks.append(labeller)
@@ -871,7 +894,7 @@ cdef class EntityRecognizer(Parser):
     nr_feature = 6
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]

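Every component in this file now follows the same pattern: begin_training(..., sgd=None) builds the model if needed, falls back to create_optimizer(), and returns the optimizer. A hypothetical minimal component sketching that contract (MyComponent and its Affine placeholder model are illustrative assumptions, not part of the diff):

from thinc.neural._classes.affine import Affine
from spacy.pipeline import Pipe


class MyComponent(Pipe):
    """Hypothetical pipe illustrating the optimizer handling in this diff."""
    name = 'my_component'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @classmethod
    def Model(cls, **cfg):
        # Placeholder model; a real component would build something useful.
        return Affine(1, 1)

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
        if self.model is True:
            self.model = self.Model(**self.cfg)
        if sgd is None:
            sgd = self.create_optimizer()   # provided by Pipe in this diff
        return sgd

Language.begin_training can then thread a single shared optimizer through every such component via the sgd argument.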
View File

@@ -30,7 +30,7 @@ from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
 
 from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
-from .._ml import link_vectors_to_models
+from .._ml import link_vectors_to_models, create_default_optimizer
 from ..compat import json_dumps, copy_array
 from ..tokens.doc cimport Doc
 from ..gold cimport GoldParse
@@ -273,6 +273,10 @@ cdef class Parser:
         }
         return (tok2vec, lower, upper), cfg
 
+    def create_optimizer(self):
+        return create_default_optimizer(self.model[0].ops,
+                                        **self.cfg.get('optimizer', {}))
+
     def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
         """Create a Parser.
@@ -793,7 +797,7 @@ cdef class Parser:
             copy_array(larger.b[:smaller.nO], smaller.b)
             self.model[-1]._layers[-1] = larger
 
-    def begin_training(self, gold_tuples, pipeline=None, **cfg):
+    def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
         gold_tuples = nonproj.preprocess_training_data(gold_tuples,
@@ -805,9 +809,14 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
-            self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
+            if sgd is None:
+                sgd = self.create_optimizer()
+            self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)
             self.cfg.update(cfg)
+        elif sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
 
     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
         '''Setup models for secondary objectives, to benefit from multi-task

View File

@@ -200,8 +200,8 @@ p
         +cell Config parameters.
 
     +row("foot")
-        +cell yields
-        +cell tuple
+        +cell returns
+        +cell callable
         +cell An optimizer.
 
 +h(2, "use_params") Language.use_params

View File

@@ -262,13 +262,13 @@ p
 +tag method
 
 p
-    | Initialize the pipe for training, using data examples if available. If no
-    | model has been initialized yet, the model is added.
+    | Initialise the pipe for training, using data examples if available. If no
+    | model has been initialised yet, the model is added.
 
 +aside-code("Example").
     #{VARNAME} = #{CLASSNAME}(nlp.vocab)
     nlp.pipeline.append(#{VARNAME})
-    #{VARNAME}.begin_training(pipeline=nlp.pipeline)
+    optimizer = #{VARNAME}.begin_training(pipeline=nlp.pipeline)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -285,6 +285,36 @@ p
             | Optional list of #[+api("pipe") #[code Pipe]] components that
             | this component is part of.
 
+    +row
+        +cell #[code sgd]
+        +cell callable
+        +cell
+            | An optional optimizer. Should take two arguments #[code weights]
+            | and #[code gradient], and an optional ID. Will be created via
+            | #[+api(CLASSNAME.toLowerCase() + "#create_optimizer") #[code create_optimizer]]
+            | if not set.
+
+    +row("foot")
+        +cell returns
+        +cell callable
+        +cell An optimizer.
+
++h(2, "create_optimizer") #{CLASSNAME}.create_optimizer
++tag method
+
+p
+    | Create an optimizer for the pipeline component.
+
++aside-code("Example").
+    #{VARNAME} = #{CLASSNAME}(nlp.vocab)
+    optimizer = #{VARNAME}.create_optimizer()
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell callable
+        +cell The optimizer.
+
 +h(2, "use_params") #{CLASSNAME}.use_params
 +tag method
 +tag contextmanager
@@ -309,7 +339,12 @@ p Modify the pipe's model, to use the given parameter values.
 
 p Add a new label to the pipe.
 
-+aside-code("Example").
-    #{VARNAME} = #{CLASSNAME}(nlp.vocab)
-    #{VARNAME}.add_label('MY_LABEL')
+if CLASSNAME == "Tagger"
+    +aside-code("Example").
+        #{VARNAME} = #{CLASSNAME}(nlp.vocab)
+        #{VARNAME}.add_label('MY_LABEL', {POS: 'NOUN'})
+else
+    +aside-code("Example").
+        #{VARNAME} = #{CLASSNAME}(nlp.vocab)
+        #{VARNAME}.add_label('MY_LABEL')
@@ -319,6 +354,14 @@ p Add a new label to the pipe.
         +cell unicode
         +cell The label to add.
 
+    if CLASSNAME == "Tagger"
+        +row
+            +cell #[code values]
+            +cell dict
+            +cell
+                | Optional values to map to the label, e.g. a tag map
+                | dictionary.
+
 +h(2, "to_disk") #{CLASSNAME}.to_disk
 +tag method
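
The sgd row added above describes the optimizer as a callable taking weights and gradient plus an optional key. A sketch of a custom callable matching that description (plain SGD, purely illustrative; my_sgd and its lr default are assumptions, not spaCy or thinc API):

def my_sgd(weights, gradient, key=None, lr=0.001):
    # Update the weights in place and clear the gradient, mirroring what
    # thinc's built-in optimizers do after applying an update.
    weights -= lr * gradient
    gradient.fill(0.)

# It could then be passed to begin_training() or update() as sgd=my_sgd.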