mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Allow multi-task objectives during training
This commit is contained in:
parent
4ae9ea7684
commit
bf917225ab
|
@ -291,7 +291,7 @@ class TokenVectorEncoder(BaseThincComponent):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
|
|
||||||
class NeuralTagger(BaseThincComponent):
|
class NeuralTagger(BaseThincComponent):
|
||||||
|
@ -395,7 +395,7 @@ class NeuralTagger(BaseThincComponent):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, **cfg):
|
def Model(cls, n_tags, **cfg):
|
||||||
|
@ -477,9 +477,25 @@ class NeuralTagger(BaseThincComponent):
|
||||||
|
|
||||||
class NeuralLabeller(NeuralTagger):
|
class NeuralLabeller(NeuralTagger):
|
||||||
name = 'nn_labeller'
|
name = 'nn_labeller'
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
if target == 'dep':
|
||||||
|
self.make_label = self.make_dep
|
||||||
|
elif target == 'tag':
|
||||||
|
self.make_label = self.make_tag
|
||||||
|
elif target == 'ent':
|
||||||
|
self.make_label = self.make_ent
|
||||||
|
elif target == 'dep_tag_offset':
|
||||||
|
self.make_label = self.make_dep_tag_offset
|
||||||
|
elif target == 'ent_tag':
|
||||||
|
self.make_label = self.make_ent_tag
|
||||||
|
elif hasattr(target, '__call__'):
|
||||||
|
self.make_label = target
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"NeuralLabeller target should be function or one of "
|
||||||
|
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||||
|
@ -495,43 +511,78 @@ class NeuralLabeller(NeuralTagger):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, gold_tuples=tuple(), pipeline=None):
|
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
for annots, brackets in annots_brackets:
|
for annots, brackets in annots_brackets:
|
||||||
ids, words, tags, heads, deps, ents = annots
|
ids, words, tags, heads, deps, ents = annots
|
||||||
for dep in deps:
|
for i in range(len(ids)):
|
||||||
if dep not in self.labels:
|
label = self.make_label(i, words, tags, heads, deps, ents)
|
||||||
self.labels[dep] = len(self.labels)
|
if label is not None and label not in self.labels:
|
||||||
token_vector_width = pipeline[0].model.nO
|
self.labels[label] = len(self.labels)
|
||||||
|
print(len(self.labels))
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
self.model = chain(
|
||||||
self.model = self.Model(len(self.labels), **self.cfg)
|
tok2vec,
|
||||||
link_vectors_to_models(self.vocab)
|
Softmax(len(self.labels), 128)
|
||||||
|
)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, n_tags, **cfg):
|
def Model(cls, n_tags, tok2vec=None, **cfg):
|
||||||
return build_tagger_model(n_tags, **cfg)
|
return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.labels:
|
for i in range(len(gold.labels)):
|
||||||
if tag is None or tag not in self.labels:
|
label = self.make_label(i, gold.words, gold.tags, gold.heads,
|
||||||
|
gold.labels, gold.ents)
|
||||||
|
if label is None or label not in self.labels:
|
||||||
correct[idx] = guesses[idx]
|
correct[idx] = guesses[idx]
|
||||||
else:
|
else:
|
||||||
correct[idx] = self.labels[tag]
|
correct[idx] = self.labels[label]
|
||||||
idx += 1
|
idx += 1
|
||||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
d_scores /= d_scores.shape[0]
|
d_scores /= d_scores.shape[0]
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_dep(i, words, tags, heads, deps, ents):
|
||||||
|
if deps[i] is None or heads[i] is None:
|
||||||
|
return None
|
||||||
|
return deps[i]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_tag(i, words, tags, heads, deps, ents):
|
||||||
|
return tags[i]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_ent(i, words, tags, heads, deps, ents):
|
||||||
|
if ents is None:
|
||||||
|
return None
|
||||||
|
return ents[i]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
|
||||||
|
if deps[i] is None or heads[i] is None:
|
||||||
|
return None
|
||||||
|
offset = heads[i] - i
|
||||||
|
offset = min(offset, 2)
|
||||||
|
offset = max(offset, -2)
|
||||||
|
return '%s-%s:%d' % (deps[i], tags[i], offset)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def make_ent_tag(i, words, tags, heads, deps, ents):
|
||||||
|
if ents is None or ents[i] is None:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
return '%s-%s' % (tags[i], ents[i])
|
||||||
|
|
||||||
|
|
||||||
class SimilarityHook(BaseThincComponent):
|
class SimilarityHook(BaseThincComponent):
|
||||||
"""
|
"""
|
||||||
|
@ -695,6 +746,14 @@ cdef class NeuralDependencyParser(NeuralParser):
|
||||||
name = 'parser'
|
name = 'parser'
|
||||||
TransitionSystem = ArcEager
|
TransitionSystem = ArcEager
|
||||||
|
|
||||||
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
|
for target in ['dep']:
|
||||||
|
labeller = NeuralLabeller(self.vocab, target=target)
|
||||||
|
tok2vec = self.model[0]
|
||||||
|
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
||||||
|
pipeline.append(labeller)
|
||||||
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
|
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
|
@ -705,13 +764,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
|
||||||
|
|
||||||
nr_feature = 6
|
nr_feature = 6
|
||||||
|
|
||||||
def predict_confidences(self, docs):
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
tensors = [d.tensor for d in docs]
|
for target in []:
|
||||||
samples = []
|
labeller = NeuralLabeller(self.vocab, target=target)
|
||||||
for i in range(10):
|
tok2vec = self.model[0]
|
||||||
states = self.parse_batch(docs, tensors, drop=0.3)
|
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
||||||
for state in states:
|
pipeline.append(labeller)
|
||||||
samples.append(self._get_entities(state))
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
|
@ -13,6 +13,7 @@ cdef class Parser:
|
||||||
cdef public object model
|
cdef public object model
|
||||||
cdef readonly TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
cdef readonly object cfg
|
cdef readonly object cfg
|
||||||
|
cdef public object _multitasks
|
||||||
|
|
||||||
cdef void _parse_step(self, StateC* state,
|
cdef void _parse_step(self, StateC* state,
|
||||||
const float* feat_weights,
|
const float* feat_weights,
|
||||||
|
|
|
@ -318,6 +318,7 @@ cdef class Parser:
|
||||||
for label in labels:
|
for label in labels:
|
||||||
self.moves.add_action(action, label)
|
self.moves.add_action(action, label)
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self._multitasks = []
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
@ -419,7 +420,7 @@ cdef class Parser:
|
||||||
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||||
while not next_step.empty():
|
while not next_step.empty():
|
||||||
if not has_hidden:
|
if not has_hidden:
|
||||||
for i in range(
|
for i in cython.parallel.prange(
|
||||||
next_step.size(), num_threads=6, nogil=True):
|
next_step.size(), num_threads=6, nogil=True):
|
||||||
self._parse_step(next_step[i],
|
self._parse_step(next_step[i],
|
||||||
feat_weights, nr_class, nr_feat, nr_piece)
|
feat_weights, nr_class, nr_feat, nr_piece)
|
||||||
|
@ -745,7 +746,7 @@ cdef class Parser:
|
||||||
# order, or the model goes out of synch
|
# order, or the model goes out of synch
|
||||||
self.cfg.setdefault('extra_labels', []).append(label)
|
self.cfg.setdefault('extra_labels', []).append(label)
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, **cfg):
|
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||||
|
@ -756,9 +757,20 @@ cdef class Parser:
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
||||||
|
self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
|
|
||||||
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
|
'''Setup models for secondary objectives, to benefit from multi-task
|
||||||
|
learning. This method is intended to be overridden by subclasses.
|
||||||
|
|
||||||
|
For instance, the dependency parser can benefit from sharing
|
||||||
|
an input representation with a label prediction model. These auxiliary
|
||||||
|
models are discarded after training.
|
||||||
|
'''
|
||||||
|
pass
|
||||||
|
|
||||||
def preprocess_gold(self, docs_golds):
|
def preprocess_gold(self, docs_golds):
|
||||||
for doc, gold in docs_golds:
|
for doc, gold in docs_golds:
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
|
Loading…
Reference in New Issue
Block a user