Set data augmentation by default

Matthew Honnibal 2017-06-08 15:24:37 -05:00
parent 34a2eecb17
commit 4c1b6a4c81


@@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                    util.env_opt('batch_to', 64),
                                    util.env_opt('batch_compound', 1.001))
+    gold_preproc = util.env_opt('gold_preproc', False)
+    noise_level = util.env_opt('noise_level', 0.25)
     if resume:
         prints(output_path / 'model19.pickle', title="Resuming training")
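
The two added lines follow the same pattern as the batch-size settings above: each option is read through util.env_opt, so an environment variable can override the default at launch time. With noise_level defaulting to 0.25, data augmentation is now on unless explicitly turned off. A minimal sketch of how an env_opt-style helper can work (illustrative only; spaCy's actual util.env_opt differs in details, such as also checking a SPACY_-prefixed variable name):

import os

def env_opt(name, default=None):
    # Sketch of an env_opt-style lookup, not spaCy's exact code:
    # an environment variable of the same name overrides the default,
    # converted with the default's numeric type.
    convert = float if isinstance(default, float) else int
    if name in os.environ:
        return convert(os.environ[name])
    return default

# e.g. with noise_level=0.5 set in the environment,
# env_opt('noise_level', 0.25) returns 0.5; otherwise 0.25.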
@@ -86,7 +88,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             i += 20
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             train_docs = corpus.train_docs(nlp, projectivize=True,
-                                           gold_preproc=False, max_length=0)
+                                           gold_preproc=gold_preproc,
+                                           noise_level=noise_level,
+                                           max_length=0)
             losses = {}
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
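
The diff only threads noise_level through to corpus.train_docs; the corruption itself happens when the corpus yields training docs and is not part of this commit. As a rough, hypothetical sketch of what character-level noise at a given rate can look like (the add_noise name and the specific corruptions here are assumptions, not spaCy's verbatim implementation):

import random

def add_noise(text, noise_level):
    # Hypothetical sketch: corrupt roughly noise_level of the characters
    # so the model trains on imperfect input.
    def corrupt(ch):
        if random.random() >= noise_level:
            return ch
        if ch in ".,!?'":
            return ''         # drop some punctuation
        return ch.lower()     # lowercase everything else
    return ''.join(corrupt(ch) for ch in text)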
@@ -105,7 +109,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 scorer = nlp_loaded.evaluate(
                     corpus.dev_docs(
                         nlp_loaded,
-                        gold_preproc=False))
+                        gold_preproc=gold_preproc))
             acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
             with acc_loc.open('w') as file_:
                 file_.write(json_dumps(scorer.scores))
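
For completeness, the batch_sizes schedule in the first hunk comes from util.compounding, which yields a batch size that grows geometrically from batch_from toward batch_to. A minimal sketch matching that documented behaviour (details assumed):

def compounding(start, stop, compound):
    # Yield start, start*compound, start*compound**2, ..., clipped at stop.
    value = float(start)
    while True:
        yield min(value, stop)
        value *= compound

batch_sizes = compounding(1, 64, 1.001)
# next(batch_sizes) -> 1.0, 1.001, 1.002001, ... approaching 64.

Combined with the env_opt defaults above, augmentation at noise_level 0.25 is the out-of-the-box behaviour; setting noise_level=0 in the environment (assuming the plain-name lookup sketched earlier) disables it.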