Don't use tagger to predict tags

This commit is contained in:
Matthew Honnibal 2017-05-08 07:55:34 -05:00
parent 50ddc9fc45
commit 245372973d

View File

@ -154,17 +154,17 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
_ = vocab[tag] _ = vocab[tag]
if vocab.morphology.tag_map: if vocab.morphology.tag_map:
for tag in tags: for tag in tags:
assert tag in vocab.morphology.tag_map, repr(tag) vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
tagger = Tagger(vocab) tagger = Tagger(vocab)
encoder = TokenVectorEncoder(vocab) encoder = TokenVectorEncoder(vocab)
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0) parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
Xs, ys = organize_data(vocab, train_sents) Xs, ys = organize_data(vocab, train_sents)
dev_Xs, dev_ys = organize_data(vocab, dev_sents) dev_Xs, dev_ys = organize_data(vocab, dev_sents)
Xs = Xs #Xs = Xs[:1000]
ys = ys #ys = ys[:1000]
dev_Xs = dev_Xs[:1000] #dev_Xs = dev_Xs[:1000]
dev_ys = dev_ys[:1000] #dev_ys = dev_ys[:1000]
with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer): with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
docs = list(Xs) docs = list(Xs)
for doc in docs: for doc in docs:
@ -173,26 +173,26 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
nn_loss = [0.] nn_loss = [0.]
def track_progress(): def track_progress():
with encoder.tagger.use_params(optimizer.averages): with encoder.tagger.use_params(optimizer.averages):
scorer = score_model(vocab, encoder, tagger, parser, dev_Xs, dev_ys) with parser.model.use_params(optimizer.averages):
scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
itn = len(nn_loss) itn = len(nn_loss)
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
nn_loss.append(0.) nn_loss.append(0.)
track_progress()
trainer.each_epoch.append(track_progress) trainer.each_epoch.append(track_progress)
trainer.batch_size = 24 trainer.batch_size = 24
trainer.nb_epoch = 10 trainer.nb_epoch = 40
for docs, golds in trainer.iterate(Xs, ys): for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs] docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
tokvecs, upd_tokvecs = encoder.begin_update(docs) tokvecs, upd_tokvecs = encoder.begin_update(docs)
for doc, tokvec in zip(docs, tokvecs): for doc, tokvec in zip(docs, tokvecs):
doc.tensor = tokvec doc.tensor = tokvec
for doc, gold in zip(docs, golds):
tagger.update(doc, gold)
d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer) d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
upd_tokvecs(d_tokvecs, sgd=optimizer) upd_tokvecs(d_tokvecs, sgd=optimizer)
encoder.update(docs, golds, sgd=optimizer) encoder.update(docs, golds, sgd=optimizer)
nn_loss[-1] += loss nn_loss[-1] += loss
nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser) nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
nlp.end_training(model_dir) #nlp.end_training(model_dir)
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))