Pass data as a function in begin_training methods
commit dd54511c4f (parent d9ebd78e11)
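
The change threads a zero-argument function, conventionally named get_gold_tuples, through every begin_training signature in place of the materialized gold data. Components that need the data call the function themselves; components that don't never touch it, and callers can supply a factory that produces a fresh iterator on every call. A minimal sketch of the convention (the names below are illustrative, not part of the commit):

    train_data = []  # gold tuples, loaded elsewhere

    def get_gold_tuples():
        # Return a fresh iterable on every call, so each consumer can
        # iterate the data from the beginning.
        return train_data

    # Before this commit, callers passed the data itself:
    #     proc.begin_training(get_gold_tuples(), ...)
    # After it, they pass the function and let each pipe decide:
    #     proc.begin_training(get_gold_tuples, ...)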
spacy/language.py
@@ -463,7 +463,7 @@ class Language(object):
             self._optimizer = sgd
         for name, proc in self.pipeline:
             if hasattr(proc, 'begin_training'):
-                proc.begin_training(get_gold_tuples(),
+                proc.begin_training(get_gold_tuples,
                                     pipeline=self.pipeline,
                                     sgd=self._optimizer,
                                     **cfg)
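
At the Language level, the training entry point now expects a data function rather than data, and simply forwards it to every pipe. A hedged usage sketch (the GoldCorpus wiring is an assumption about the surrounding spaCy 2.0-era API, not shown in this diff):

    import spacy
    from spacy.gold import GoldCorpus

    nlp = spacy.blank('en')
    corpus = GoldCorpus('train.json', 'dev.json')  # hypothetical paths

    # Pass a function; Language.begin_training hands it to each pipe.
    optimizer = nlp.begin_training(lambda: corpus.train_tuples)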

spacy/pipeline.pyx
@@ -184,7 +184,7 @@ class Pipe(object):
         return create_default_optimizer(self.model.ops,
                                         **self.cfg.get('optimizer', {}))
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
         """Initialize the pipe for training, using data examples if available.
         If no model has been initialized yet, the model is added."""
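
The new default, lambda: [], keeps the contract uniform: the parameter is always a callable returning an iterable, even when no data is supplied. A sketch of what a custom component would implement under this contract (the component itself is invented for illustration):

    class LengthStats(object):
        """Hypothetical pipeline component following the new contract."""
        name = 'length_stats'

        def __call__(self, doc):
            return doc

        def begin_training(self, get_gold_tuples=lambda: [], pipeline=None,
                           sgd=None, **kwargs):
            # Materialize the data only here, where it is actually needed.
            n_examples = sum(1 for _ in get_gold_tuples())
            print('training examples available:', n_examples)
            return sgd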

@@ -386,7 +386,7 @@ class Tensorizer(Pipe):
         loss = (d_scores**2).sum()
         return loss, d_scores
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
         """Allocate models, pre-process training data and acquire an
         optimizer.

@@ -510,11 +510,11 @@ class Tagger(Pipe):
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
-        for raw_text, annots_brackets in gold_tuples:
+        for raw_text, annots_brackets in get_gold_tuples():
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
                 for tag in tags:
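
The Tagger's loop pins down the shape the data function must yield: each item pairs a raw text with a list of (annotations, brackets) pairs, where the annotations are the six parallel sequences it destructures. A sketch with invented values:

    def get_gold_tuples():
        ids = [0, 1]
        words = ['Hello', 'world']
        tags = ['UH', 'NN']
        heads = [0, 0]
        deps = ['ROOT', 'npadvmod']
        ents = ['O', 'O']
        annots = (ids, words, tags, heads, deps, ents)
        return [('Hello world', [(annots, [])])]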

@@ -687,9 +687,9 @@ class MultitaskObjective(Tagger):
     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
                         sgd=None, **kwargs):
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
+        gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
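
Here the function is called exactly once and the preprocessed result is bound to a local, so a provider that returns a one-shot generator still works: every begin_training call gets its own fresh iterator. A sketch of such a provider (the file format is invented):

    def make_provider(path):
        def get_gold_tuples():
            # Re-open the file on every call, so each consumer gets a
            # fresh generator rather than an exhausted iterator.
            with open(path) as f:
                for line in f:
                    yield line.rstrip('\n')  # stand-in for a parsed gold tuple
        return get_gold_tuples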

@@ -962,7 +962,7 @@ class TextCategorizer(Pipe):
         self.labels.append(label)
         return 1
 
-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
         else:

@@ -990,10 +990,10 @@ cdef class DependencyParser(Parser):
         labeller = MultitaskObjective(self.vocab, target=target)
         self._multitasks.append(labeller)
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
         for labeller in self._multitasks:
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline,
+            labeller.begin_training(get_gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec, sgd=sgd)
 
     def __reduce__(self):

@@ -1011,10 +1011,10 @@ cdef class EntityRecognizer(Parser):
         labeller = MultitaskObjective(self.vocab, target=target)
         self._multitasks.append(labeller)
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
+    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
         for labeller in self._multitasks:
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline,
+            labeller.begin_training(get_gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec)
 
     def __reduce__(self):
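
Note the division of labour in the two parser subclasses: init_multitask_objectives never calls get_gold_tuples itself, it only forwards the function to each labeller's begin_training, and the MultitaskObjective hunk above is where the call finally happens.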

spacy/syntax/nn_parser.pyx
@@ -833,11 +833,11 @@ cdef class Parser:
             copy_array(larger.b[:smaller.nO], smaller.b)
             self.model[-1]._layers[-1] = larger
 
-    def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg):
+    def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
         cfg.setdefault('min_action_freq', 30)
-        actions = self.moves.get_actions(gold_parses=gold_tuples,
+        actions = self.moves.get_actions(gold_parses=get_gold_tuples(),
                                          min_freq=cfg.get('min_action_freq', 30))
         self.moves.initialize_actions(actions)
         cfg.setdefault('token_vector_width', 128)
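
Parser.begin_training shows both sides of the contract in one method: it calls the function where the data is consumed (get_actions) and passes the function onward where it is merely forwarded (init_multitask_objectives). Reduced to a skeleton with stand-in names:

    class ParserSketch(object):
        # Illustrative skeleton, not spaCy's Parser.
        def begin_training(self, get_gold_tuples, pipeline=None, **cfg):
            # Consume the data: call the function.
            actions = list(get_gold_tuples())
            # Forward the data source: pass the function through uncalled.
            self.init_multitask_objectives(get_gold_tuples, pipeline, **cfg)

        def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg):
            pass  # subclasses decide whether and when to call it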

@@ -849,7 +849,7 @@ cdef class Parser:
             self.model[1].begin_training(
                 self.model[1].ops.allocate((5, cfg['token_vector_width'])))
             if pipeline is not None:
-                self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg)
+                self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)
         else:
             if sgd is None:

@@ -863,7 +863,7 @@ cdef class Parser:
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError
 
-    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+    def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg):
         '''Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.