* Try new CoNLL tagger method

This commit is contained in:
Matthew Honnibal 2016-02-22 22:57:06 +01:00
parent 3f12fb4191
commit 92e9134603

View File

@@ -116,37 +116,42 @@ def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=0,
random.shuffle(train_sents) random.shuffle(train_sents)
heldout_sents = train_sents[:int(nr_train * 0.1)] heldout_sents = train_sents[:int(nr_train * 0.1)]
train_sents = train_sents[len(heldout_sents):] train_sents = train_sents[len(heldout_sents):]
assert len(heldout_sents) < len(train_sents) #train_sents = train_sents[:500]
#assert len(heldout_sents) < len(train_sents)
prev_score = 0.0 prev_score = 0.0
variance = 0.001 variance = 0.001
last_good_learn_rate = nlp.tagger.model.eta last_good_learn_rate = nlp.tagger.model.eta
for itn in range(n_iter): n = 0
random.shuffle(train_sents) total = 0
acc = 0 acc = 0
total = 0 while True:
for words, gold_tags in train_sents: words, gold_tags = random.choice(train_sents)
tokens = nlp.tokenizer.tokens_from_list(words) tokens = nlp.tokenizer.tokens_from_list(words)
acc += nlp.tagger.train(tokens, gold_tags) acc += nlp.tagger.train(tokens, gold_tags)
total += len(tokens) total += len(tokens)
dev_score = score_model(nlp, heldout_sents) n += 1
eval_score = score_model(nlp, dev_sents) if n and n % 10000 == 0:
if dev_score >= prev_score: dev_score = score_model(nlp, heldout_sents)
nlp.tagger.model.keep_update() eval_score = score_model(nlp, dev_sents)
prev_score = dev_score if dev_score > prev_score:
variance = 0.001 nlp.tagger.model.keep_update()
last_good_learn_rate = nlp.tagger.model.eta prev_score = dev_score
nlp.tagger.model.eta *= 1.05 variance = 0.001
print('%d:\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, acc/total, dev_score, eval_score, nlp.tagger.model.eta)) last_good_learn_rate = nlp.tagger.model.eta
else: nlp.tagger.model.eta *= 1.05
nlp.tagger.model.backtrack() print('%d:\t%.3f\t%.3f\t%.3f\t%.4f' % (n, acc/total, dev_score, eval_score, nlp.tagger.model.eta))
new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
if new_eta >= 0.00001:
nlp.tagger.model.eta = new_eta
else: else:
nlp.tagger.model.eta = 0.00001 nlp.tagger.model.backtrack()
print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta)) new_eta = numpy.random.normal(loc=last_good_learn_rate, scale=variance)
variance *= 1.1 if new_eta >= 0.0001:
prev_score *= 0.9999 nlp.tagger.model.eta = new_eta
else:
nlp.tagger.model.eta = 0.0001
print('X:\t%.3f\t%.3f\t%.3f\t%.4f' % (acc/total, dev_score, eval_score, nlp.tagger.model.eta))
variance *= 1.1
prev_score *= 0.9999
acc = 0.0
total = 0.0
nlp.end_training(data_dir=model_dir) nlp.end_training(data_dir=model_dir)
return nlp return nlp