Pass data as a function in begin_training methods

This commit is contained in:
Matthew Honnibal 2018-03-27 09:39:59 +00:00
parent d9ebd78e11
commit dd54511c4f
3 changed files with 16 additions and 16 deletions

View File

@ -463,7 +463,7 @@ class Language(object):
self._optimizer = sgd self._optimizer = sgd
for name, proc in self.pipeline: for name, proc in self.pipeline:
if hasattr(proc, 'begin_training'): if hasattr(proc, 'begin_training'):
proc.begin_training(get_gold_tuples(), proc.begin_training(get_gold_tuples,
pipeline=self.pipeline, pipeline=self.pipeline,
sgd=self._optimizer, sgd=self._optimizer,
**cfg) **cfg)

View File

@ -184,7 +184,7 @@ class Pipe(object):
return create_default_optimizer(self.model.ops, return create_default_optimizer(self.model.ops,
**self.cfg.get('optimizer', {})) **self.cfg.get('optimizer', {}))
def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None, def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
**kwargs): **kwargs):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
If no model has been initialized yet, the model is added.""" If no model has been initialized yet, the model is added."""
@ -386,7 +386,7 @@ class Tensorizer(Pipe):
loss = (d_scores**2).sum() loss = (d_scores**2).sum()
return loss, d_scores return loss, d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None, def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None,
**kwargs): **kwargs):
"""Allocate models, pre-process training data and acquire an """Allocate models, pre-process training data and acquire an
optimizer. optimizer.
@ -510,11 +510,11 @@ class Tagger(Pipe):
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None, def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
**kwargs): **kwargs):
orig_tag_map = dict(self.vocab.morphology.tag_map) orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict() new_tag_map = OrderedDict()
for raw_text, annots_brackets in gold_tuples: for raw_text, annots_brackets in get_gold_tuples():
for annots, brackets in annots_brackets: for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots ids, words, tags, heads, deps, ents = annots
for tag in tags: for tag in tags:
@ -687,9 +687,9 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids, tensors=None): def set_annotations(self, docs, dep_ids, tensors=None):
pass pass
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None, def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
sgd=None, **kwargs): sgd=None, **kwargs):
gold_tuples = nonproj.preprocess_training_data(gold_tuples) gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
for raw_text, annots_brackets in gold_tuples: for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets: for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots ids, words, tags, heads, deps, ents = annots
@ -962,7 +962,7 @@ class TextCategorizer(Pipe):
self.labels.append(label) self.labels.append(label)
return 1 return 1
def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None): def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None):
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer': if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
token_vector_width = pipeline[0].model.nO token_vector_width = pipeline[0].model.nO
else: else:
@ -990,10 +990,10 @@ cdef class DependencyParser(Parser):
labeller = MultitaskObjective(self.vocab, target=target) labeller = MultitaskObjective(self.vocab, target=target)
self._multitasks.append(labeller) self._multitasks.append(labeller)
def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg): def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
for labeller in self._multitasks: for labeller in self._multitasks:
tok2vec = self.model[0] tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, labeller.begin_training(get_gold_tuples, pipeline=pipeline,
tok2vec=tok2vec, sgd=sgd) tok2vec=tok2vec, sgd=sgd)
def __reduce__(self): def __reduce__(self):
@ -1011,10 +1011,10 @@ cdef class EntityRecognizer(Parser):
labeller = MultitaskObjective(self.vocab, target=target) labeller = MultitaskObjective(self.vocab, target=target)
self._multitasks.append(labeller) self._multitasks.append(labeller)
def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg): def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
for labeller in self._multitasks: for labeller in self._multitasks:
tok2vec = self.model[0] tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, labeller.begin_training(get_gold_tuples, pipeline=pipeline,
tok2vec=tok2vec) tok2vec=tok2vec)
def __reduce__(self): def __reduce__(self):

View File

@ -833,11 +833,11 @@ cdef class Parser:
copy_array(larger.b[:smaller.nO], smaller.b) copy_array(larger.b[:smaller.nO], smaller.b)
self.model[-1]._layers[-1] = larger self.model[-1]._layers[-1] = larger
def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg): def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
cfg.setdefault('min_action_freq', 30) cfg.setdefault('min_action_freq', 30)
actions = self.moves.get_actions(gold_parses=gold_tuples, actions = self.moves.get_actions(gold_parses=get_gold_tuples(),
min_freq=cfg.get('min_action_freq', 30)) min_freq=cfg.get('min_action_freq', 30))
self.moves.initialize_actions(actions) self.moves.initialize_actions(actions)
cfg.setdefault('token_vector_width', 128) cfg.setdefault('token_vector_width', 128)
@ -849,7 +849,7 @@ cdef class Parser:
self.model[1].begin_training( self.model[1].begin_training(
self.model[1].ops.allocate((5, cfg['token_vector_width']))) self.model[1].ops.allocate((5, cfg['token_vector_width'])))
if pipeline is not None: if pipeline is not None:
self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg) self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
else: else:
if sgd is None: if sgd is None:
@ -863,7 +863,7 @@ cdef class Parser:
# Defined in subclasses, to avoid circular import # Defined in subclasses, to avoid circular import
raise NotImplementedError raise NotImplementedError
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg):
'''Setup models for secondary objectives, to benefit from multi-task '''Setup models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses. learning. This method is intended to be overridden by subclasses.