mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00

Allow multi-task objectives during training

This commit is contained in:
  parent 4ae9ea7684
  commit bf917225ab
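In outline: the parser's model is a sequence of layers whose first element is the shared token-to-vector encoder, and each auxiliary objective is that same encoder followed by its own softmax layer, so gradients from both tasks update the shared weights. A minimal sketch of the sharing, using the `chain` and `Softmax` combinators that appear in the diff below (`parser` and `n_labels` here are assumptions, not part of the commit):

    # Sketch only: both objectives backprop into the same tok2vec weights.
    tok2vec = parser.model[0]                           # shared encoder layer
    aux_model = chain(tok2vec, Softmax(n_labels, 128))  # auxiliary labeller head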
@@ -291,7 +291,7 @@ class TokenVectorEncoder(BaseThincComponent):
         if self.model is True:
             self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(**self.cfg)
-            link_vectors_to_models(self.vocab)
+        link_vectors_to_models(self.vocab)


 class NeuralTagger(BaseThincComponent):
@@ -395,7 +395,7 @@ class NeuralTagger(BaseThincComponent):
         if self.model is True:
             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
-            link_vectors_to_models(self.vocab)
+        link_vectors_to_models(self.vocab)

     @classmethod
     def Model(cls, n_tags, **cfg):
@@ -477,9 +477,25 @@ class NeuralTagger(BaseThincComponent):

 class NeuralLabeller(NeuralTagger):
     name = 'nn_labeller'
-    def __init__(self, vocab, model=True, **cfg):
+    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
+        if target == 'dep':
+            self.make_label = self.make_dep
+        elif target == 'tag':
+            self.make_label = self.make_tag
+        elif target == 'ent':
+            self.make_label = self.make_ent
+        elif target == 'dep_tag_offset':
+            self.make_label = self.make_dep_tag_offset
+        elif target == 'ent_tag':
+            self.make_label = self.make_ent_tag
+        elif hasattr(target, '__call__'):
+            self.make_label = target
+        else:
+            raise ValueError(
+                "NeuralLabeller target should be function or one of "
+                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
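A sketch of how the new `target` argument could be used: either one of the built-in string targets, or any callable with the signature `(i, words, tags, heads, deps, ents)` returning a label or `None`. The `vocab` object and the custom function below are assumptions for illustration:

    def make_coarse_dep(i, words, tags, heads, deps, ents):
        # Illustrative custom target: coarsen dependency labels.
        if deps[i] is None or heads[i] is None:
            return None
        return deps[i].split(':')[0]

    labeller = NeuralLabeller(vocab, target='dep_tag_offset')  # built-in target
    custom = NeuralLabeller(vocab, target=make_coarse_dep)     # callable target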
@@ -495,43 +511,78 @@ class NeuralLabeller(NeuralTagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
-                for dep in deps:
-                    if dep not in self.labels:
-                        self.labels[dep] = len(self.labels)
-        token_vector_width = pipeline[0].model.nO
+                for i in range(len(ids)):
+                    label = self.make_label(i, words, tags, heads, deps, ents)
+                    if label is not None and label not in self.labels:
+                        self.labels[label] = len(self.labels)
+        print(len(self.labels))
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-            self.model = self.Model(len(self.labels), **self.cfg)
-            link_vectors_to_models(self.vocab)
+            self.model = chain(
+                tok2vec,
+                Softmax(len(self.labels), 128)
+            )
+        link_vectors_to_models(self.vocab)

     @classmethod
-    def Model(cls, n_tags, **cfg):
-        return build_tagger_model(n_tags, **cfg)
+    def Model(cls, n_tags, tok2vec=None, **cfg):
+        return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)

     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
         for gold in golds:
-            for tag in gold.labels:
-                if tag is None or tag not in self.labels:
+            for i in range(len(gold.labels)):
+                label = self.make_label(i, gold.words, gold.tags, gold.heads,
+                                        gold.labels, gold.ents)
+                if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
-                    correct[idx] = self.labels[tag]
+                    correct[idx] = self.labels[label]
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         d_scores /= d_scores.shape[0]
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

+    @staticmethod
+    def make_dep(i, words, tags, heads, deps, ents):
+        if deps[i] is None or heads[i] is None:
+            return None
+        return deps[i]
+
+    @staticmethod
+    def make_tag(i, words, tags, heads, deps, ents):
+        return tags[i]
+
+    @staticmethod
+    def make_ent(i, words, tags, heads, deps, ents):
+        if ents is None:
+            return None
+        return ents[i]
+
+    @staticmethod
+    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
+        if deps[i] is None or heads[i] is None:
+            return None
+        offset = heads[i] - i
+        offset = min(offset, 2)
+        offset = max(offset, -2)
+        return '%s-%s:%d' % (deps[i], tags[i], offset)
+
+    @staticmethod
+    def make_ent_tag(i, words, tags, heads, deps, ents):
+        if ents is None or ents[i] is None:
+            return None
+        else:
+            return '%s-%s' % (tags[i], ents[i])


 class SimilarityHook(BaseThincComponent):
     """
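The new `get_loss` builds the target for each token with `make_label` and then applies the usual softmax-minus-one-hot gradient; for `make_dep_tag_offset`, a token with dep `amod`, tag `JJ` and a head one position to the left yields the label 'amod-JJ:-1', with the offset clipped to [-2, 2]. A standalone numpy re-implementation of the gradient, as a sketch outside spaCy:

    import numpy

    def labeller_loss(scores, gold_ids):
        # scores: (n_tokens, n_classes) softmax outputs
        # gold_ids: (n_tokens,) integer label ids
        one_hot = numpy.eye(scores.shape[1], dtype='f')[gold_ids]
        d_scores = (scores - one_hot) / scores.shape[0]  # mean over tokens
        loss = (d_scores ** 2).sum()  # same surrogate loss the diff reports
        return loss, d_scores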
@@ -695,6 +746,14 @@ cdef class NeuralDependencyParser(NeuralParser):
     name = 'parser'
     TransitionSystem = ArcEager

+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        for target in ['dep']:
+            labeller = NeuralLabeller(self.vocab, target=target)
+            tok2vec = self.model[0]
+            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            pipeline.append(labeller)
+            self._multitasks.append(labeller)
+
     def __reduce__(self):
         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
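Taken together with the `begin_training` changes further down, the flow is roughly as follows (a sketch; `tensorizer` and `gold_tuples` are assumed names, not from the diff):

    pipeline = [tensorizer, parser]
    parser.begin_training(gold_tuples, pipeline=pipeline)
    # begin_training calls init_multitask_objectives, which appends a
    # NeuralLabeller sharing parser.model[0] (the tok2vec layer), so the
    # pipeline is now [tensorizer, parser, labeller].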
@@ -705,13 +764,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):

     nr_feature = 6

-    def predict_confidences(self, docs):
-        tensors = [d.tensor for d in docs]
-        samples = []
-        for i in range(10):
-            states = self.parse_batch(docs, tensors, drop=0.3)
-            for state in states:
-                samples.append(self._get_entities(state))
+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        for target in []:
+            labeller = NeuralLabeller(self.vocab, target=target)
+            tok2vec = self.model[0]
+            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            pipeline.append(labeller)
+            self._multitasks.append(labeller)

     def __reduce__(self):
         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
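As committed, the recognizer's loop runs over an empty list, so no auxiliary objective is actually created yet. A variant that enabled one might look like this (an assumption, not part of the commit):

    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
        for target in ['ent_tag']:  # e.g. jointly predict tag and entity type
            labeller = NeuralLabeller(self.vocab, target=target)
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=self.model[0])
            pipeline.append(labeller)
            self._multitasks.append(labeller)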

@@ -13,6 +13,7 @@ cdef class Parser:
     cdef public object model
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
+    cdef public object _multitasks

     cdef void _parse_step(self, StateC* state,
                           const float* feat_weights,

@@ -318,6 +318,7 @@ cdef class Parser:
         for label in labels:
             self.moves.add_action(action, label)
         self.model = model
+        self._multitasks = []

     def __reduce__(self):
         return (Parser, (self.vocab, self.moves, self.model), None, None)
@@ -419,7 +420,7 @@ cdef class Parser:
         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
         while not next_step.empty():
             if not has_hidden:
-                for i in range(
+                for i in cython.parallel.prange(
                         next_step.size(), num_threads=6, nogil=True):
                     self._parse_step(next_step[i],
                         feat_weights, nr_class, nr_feat, nr_piece)
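The hunk above replaces the serial `range` with `cython.parallel.prange`, which runs loop iterations across OpenMP threads with the GIL released. A minimal standalone sketch of the pattern (not spaCy code):

    # cython: boundscheck=False, wraparound=False
    from cython.parallel cimport prange

    def scale(float[:] xs, float factor):
        # Parallel loop over a typed memoryview with the GIL released.
        cdef int i
        for i in prange(xs.shape[0], num_threads=6, nogil=True):
            xs[i] = xs[i] * factor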
@@ -745,7 +746,7 @@ cdef class Parser:
         # order, or the model goes out of synch
         self.cfg.setdefault('extra_labels', []).append(label)

-    def begin_training(self, gold_tuples, **cfg):
+    def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@@ -756,9 +757,20 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
+            self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
             link_vectors_to_models(self.vocab)
         self.cfg.update(cfg)

+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        '''Setup models for secondary objectives, to benefit from multi-task
+        learning. This method is intended to be overridden by subclasses.
+
+        For instance, the dependency parser can benefit from sharing
+        an input representation with a label prediction model. These auxiliary
+        models are discarded after training.
+        '''
+        pass
+
     def preprocess_gold(self, docs_golds):
         for doc, gold in docs_golds:
             yield doc, gold
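The base-class hook is deliberately a no-op; subclasses override it, as NeuralDependencyParser does above. A sketch of a custom override (all names other than NeuralLabeller and the hook itself are illustrative):

    cdef class MyParser(Parser):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Add a POS-tagging auxiliary objective over the shared tok2vec.
            labeller = NeuralLabeller(self.vocab, target='tag')
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=self.model[0])
            pipeline.append(labeller)
            self._multitasks.append(labeller)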