Allow multi-task objectives during training

This commit is contained in:
Matthew Honnibal 2017-09-26 05:42:52 -05:00
parent 4ae9ea7684
commit bf917225ab
3 changed files with 99 additions and 27 deletions

View File

@ -291,7 +291,7 @@ class TokenVectorEncoder(BaseThincComponent):
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg) self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
class NeuralTagger(BaseThincComponent): class NeuralTagger(BaseThincComponent):
@ -395,7 +395,7 @@ class NeuralTagger(BaseThincComponent):
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
@classmethod @classmethod
def Model(cls, n_tags, **cfg): def Model(cls, n_tags, **cfg):
@ -477,9 +477,25 @@ class NeuralTagger(BaseThincComponent):
class NeuralLabeller(NeuralTagger): class NeuralLabeller(NeuralTagger):
name = 'nn_labeller' name = 'nn_labeller'
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
if target == 'dep':
self.make_label = self.make_dep
elif target == 'tag':
self.make_label = self.make_tag
elif target == 'ent':
self.make_label = self.make_ent
elif target == 'dep_tag_offset':
self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag':
self.make_label = self.make_ent_tag
elif hasattr(target, '__call__'):
self.make_label = target
else:
raise ValueError(
"NeuralLabeller target should be function or one of "
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2) self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
@ -495,43 +511,78 @@ class NeuralLabeller(NeuralTagger):
def set_annotations(self, docs, dep_ids): def set_annotations(self, docs, dep_ids):
pass pass
def begin_training(self, gold_tuples=tuple(), pipeline=None): def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples) gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples: for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets: for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots ids, words, tags, heads, deps, ents = annots
for dep in deps: for i in range(len(ids)):
if dep not in self.labels: label = self.make_label(i, words, tags, heads, deps, ents)
self.labels[dep] = len(self.labels) if label is not None and label not in self.labels:
token_vector_width = pipeline[0].model.nO self.labels[label] = len(self.labels)
print(len(self.labels))
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] self.model = chain(
self.model = self.Model(len(self.labels), **self.cfg) tok2vec,
link_vectors_to_models(self.vocab) Softmax(len(self.labels), 128)
)
link_vectors_to_models(self.vocab)
@classmethod @classmethod
def Model(cls, n_tags, **cfg): def Model(cls, n_tags, tok2vec=None, **cfg):
return build_tagger_model(n_tags, **cfg) return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0 cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i') correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1) guesses = scores.argmax(axis=1)
for gold in golds: for gold in golds:
for tag in gold.labels: for i in range(len(gold.labels)):
if tag is None or tag not in self.labels: label = self.make_label(i, gold.words, gold.tags, gold.heads,
gold.labels, gold.ents)
if label is None or label not in self.labels:
correct[idx] = guesses[idx] correct[idx] = guesses[idx]
else: else:
correct[idx] = self.labels[tag] correct[idx] = self.labels[label]
idx += 1 idx += 1
correct = self.model.ops.xp.array(correct, dtype='i') correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0] d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum() loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores return float(loss), d_scores
@staticmethod
def make_dep(i, words, tags, heads, deps, ents):
    """Return the dependency label for token ``i``, or None if unavailable.

    The label is ``deps[i]``. None is returned when either ``deps[i]`` or
    ``heads[i]`` is None. ``words``, ``tags`` and ``ents`` are not read; they
    are accepted so that every label-making function can be called with the
    same argument list.
    """
    label = deps[i]
    # Only emit a label when both the label and its head are annotated.
    if label is not None and heads[i] is not None:
        return label
    return None
@staticmethod
def make_tag(i, words, tags, heads, deps, ents):
    """Return ``tags[i]`` unchanged as the label for token ``i``.

    The remaining arguments are not read; they are accepted so that every
    label-making function can be called with the same argument list.
    """
    tag = tags[i]
    return tag
@staticmethod
def make_ent(i, words, tags, heads, deps, ents):
    """Return ``ents[i]`` as the label for token ``i``.

    None is returned when ``ents`` itself is None. Otherwise the entry at
    index ``i`` is returned as-is (which may itself be None). The other
    arguments are not read; they are accepted so that every label-making
    function can be called with the same argument list.
    """
    return None if ents is None else ents[i]
@staticmethod
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
    """Build a combined "dep-tag:offset" label for token ``i``.

    Returns None when ``deps[i]`` or ``heads[i]`` is None. Otherwise the
    label is ``'<deps[i]>-<tags[i]>:<offset>'``, where the offset is
    ``heads[i] - i`` clamped to the range [-2, 2]. ``words`` and ``ents``
    are not read; they are accepted so that every label-making function can
    be called with the same argument list.
    """
    dep = deps[i]
    if dep is None:
        return None
    head = heads[i]
    if head is None:
        return None
    # Clamp the head distance so far-away heads share one class per side.
    clamped = max(min(head - i, 2), -2)
    return '%s-%s:%d' % (dep, tags[i], clamped)
@staticmethod
def make_ent_tag(i, words, tags, heads, deps, ents):
    """Build a combined "tag-ent" label for token ``i``.

    Returns None when ``ents`` is None or ``ents[i]`` is None. Otherwise the
    label is ``'<tags[i]>-<ents[i]>'``. ``words``, ``heads`` and ``deps``
    are not read; they are accepted so that every label-making function can
    be called with the same argument list.
    """
    if ents is None:
        return None
    ent = ents[i]
    if ent is None:
        return None
    return '%s-%s' % (tags[i], ent)
class SimilarityHook(BaseThincComponent): class SimilarityHook(BaseThincComponent):
""" """
@ -695,6 +746,14 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser' name = 'parser'
TransitionSystem = ArcEager TransitionSystem = ArcEager
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    """Create the auxiliary labellers trained alongside the dependency parser.

    For each auxiliary target (currently only 'dep') a NeuralLabeller is
    built on this parser's vocab, initialised via its ``begin_training``
    with the first element of ``self.model`` as ``tok2vec``, and recorded
    in ``self._multitasks``.

    gold_tuples: Training data, forwarded to each labeller's begin_training.
    pipeline: List of pipeline components to which each labeller is
        appended, or None. ``Parser.begin_training`` defaults ``pipeline``
        to None, so that case is skipped rather than raising
        AttributeError on ``None.append``.
    **cfg: Accepted for signature compatibility with the base hook; not read.
    """
    # The shared input layer is the same for every target, so look it up
    # once. NOTE(review): self.model[0] is presumably the token-to-vector
    # layer of the parser model -- confirm against Parser.Model.
    tok2vec = self.model[0]
    for target in ['dep']:
        labeller = NeuralLabeller(self.vocab, target=target)
        labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
        if pipeline is not None:
            pipeline.append(labeller)
        self._multitasks.append(labeller)
def __reduce__(self): def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
@ -705,13 +764,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6 nr_feature = 6
def predict_confidences(self, docs): def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
tensors = [d.tensor for d in docs] for target in []:
samples = [] labeller = NeuralLabeller(self.vocab, target=target)
for i in range(10): tok2vec = self.model[0]
states = self.parse_batch(docs, tensors, drop=0.3) labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
for state in states: pipeline.append(labeller)
samples.append(self._get_entities(state)) self._multitasks.append(labeller)
def __reduce__(self): def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)

View File

@ -13,6 +13,7 @@ cdef class Parser:
cdef public object model cdef public object model
cdef readonly TransitionSystem moves cdef readonly TransitionSystem moves
cdef readonly object cfg cdef readonly object cfg
cdef public object _multitasks
cdef void _parse_step(self, StateC* state, cdef void _parse_step(self, StateC* state,
const float* feat_weights, const float* feat_weights,

View File

@ -318,6 +318,7 @@ cdef class Parser:
for label in labels: for label in labels:
self.moves.add_action(action, label) self.moves.add_action(action, label)
self.model = model self.model = model
self._multitasks = []
def __reduce__(self): def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None) return (Parser, (self.vocab, self.moves, self.model), None, None)
@ -419,7 +420,7 @@ cdef class Parser:
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty(): while not next_step.empty():
if not has_hidden: if not has_hidden:
for i in range( for i in cython.parallel.prange(
next_step.size(), num_threads=6, nogil=True): next_step.size(), num_threads=6, nogil=True):
self._parse_step(next_step[i], self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece) feat_weights, nr_class, nr_feat, nr_piece)
@ -745,7 +746,7 @@ cdef class Parser:
# order, or the model goes out of synch # order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label) self.cfg.setdefault('extra_labels', []).append(label)
def begin_training(self, gold_tuples, **cfg): def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples) gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@ -756,9 +757,20 @@ cdef class Parser:
if self.model is True: if self.model is True:
cfg['pretrained_dims'] = self.vocab.vectors_length cfg['pretrained_dims'] = self.vocab.vectors_length
self.model, cfg = self.Model(self.moves.n_moves, **cfg) self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
self.cfg.update(cfg) self.cfg.update(cfg)
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
    """Hook for setting up secondary (multi-task) objectives; a no-op here.

    Subclasses are expected to override this to build auxiliary models that
    share an input representation with the main model -- for example, the
    dependency parser pairing itself with a label prediction model. Such
    auxiliary models are meant to be discarded after training.

    The base implementation ignores all of its arguments and returns None.
    """
    return None
def preprocess_gold(self, docs_golds): def preprocess_gold(self, docs_golds):
for doc, gold in docs_golds: for doc, gold in docs_golds:
yield doc, gold yield doc, gold