Restore changes from nn-beam-parser

Matthew Honnibal 2017-08-18 22:26:12 +02:00
parent ce321b0322
commit 11c31d285c
3 changed files with 23 additions and 11 deletions

View File

@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.
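
The annotation fix matters because plac uses the annotation's type to coerce the command-line string, and the new keyword defaults let the optional arguments be omitted. A minimal, self-contained sketch of that plac behaviour (the demo function below is hypothetical, not part of the commit):

import plac

@plac.annotations(
    n_sents=("Number of sentences per doc", "option", "n", int),
)
def demo(n_sents=1):
    # plac coerces "-n 5" from the command line to the int 5;
    # when the flag is omitted, the keyword default (1) is used.
    print(type(n_sents), n_sents)

if __name__ == '__main__':
    plac.call(demo)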

View File

@@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
+                           drop=next(dropout_rates), losses=losses,
+                           update_tensors=True)
                 pbar.update(sum(len(doc) for doc in docs))

         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
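
The only functional change to the training loop is the new update_tensors=True keyword, which asks Language.update to backpropagate the downstream components' gradients into the shared tok2vec tensor (the pickle dump per epoch is simply dropped). A rough usage sketch of the call pattern, assuming nlp, optimizer and train_docs were created earlier in the script, as the real CLI does:

from spacy.util import minibatch, compounding

# nlp, optimizer and train_docs are assumed to exist already.
losses = {}
for batch in minibatch(train_docs, size=compounding(1., 32., 1.001)):
    docs, golds = zip(*batch)
    nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses,
               update_tensors=True)   # also backprop into the shared tok2vec weights
print(losses)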

View File

@@ -277,7 +277,8 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None,
+               update_tensors=False):
         """Update the models in the pipeline.

         docs (iterable): A batch of `Doc` objects.
@@ -304,14 +305,17 @@ class Language(object):
             grads[key] = (W, dW)
         pipes = list(self.pipeline[1:])
         random.shuffle(pipes)
+        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
+        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
         for proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
-            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            if d_tokvecses is not None:
-                bp_tokvecses(d_tokvecses, sgd=sgd)
+            if update_tensors and d_tokvecses is not None:
+                for i, d_tv in enumerate(d_tokvecses):
+                    all_d_tokvecses[i] += d_tv
+        bp_tokvecses(all_d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.
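
The restructured update() computes the shared tok2vec activations once per batch, lets every component produce its gradient with respect to those activations, accumulates those gradients into a shared buffer (only when update_tensors is set), and runs a single backward pass through the shared model after the loop. A toy sketch of that accumulate-then-backprop pattern, using numpy stand-ins for the real pipeline objects (none of the names below come from the commit):

import numpy

def begin_update(feats):
    # Stand-in for tok2vec.model.begin_update: returns activations plus a
    # callback that applies the accumulated gradient to the shared weights.
    tokvecs = [numpy.random.rand(5, 4) for _ in feats]
    def backprop(d_tokvecs, sgd=None):
        print("single backward pass, summed gradient:", sum(d.sum() for d in d_tokvecs))
    return tokvecs, backprop

feats = [object(), object()]                        # one entry per doc
tokvecses, bp_tokvecses = begin_update(feats)
# One zeroed buffer per doc, same shape as the shared activations.
all_d_tokvecses = [numpy.zeros(tv.shape) for tv in tokvecses]
for component in range(3):                          # stand-ins for tagger/parser/NER
    d_tokvecses = [numpy.ones(tv.shape) for tv in tokvecses]   # fake per-component gradients
    for i, d_tv in enumerate(d_tokvecses):
        all_d_tokvecses[i] += d_tv                  # accumulate instead of backpropping per component
bp_tokvecses(all_d_tokvecses, sgd=None)             # one backward pass through the shared model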
@@ -381,9 +385,18 @@ class Language(object):
         return optimizer

     def evaluate(self, docs_golds):
-        docs, golds = zip(*docs_golds)
         scorer = Scorer()
-        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
+        docs, golds = zip(*docs_golds)
+        docs = list(docs)
+        golds = list(golds)
+        for pipe in self.pipeline:
+            if not hasattr(pipe, 'pipe'):
+                for doc in docs:
+                    pipe(doc)
+            else:
+                docs = list(pipe.pipe(docs))
+        assert len(docs) == len(golds)
+        for doc, gold in zip(docs, golds):
             scorer.score(doc, gold)
             doc.tensor = None
         return scorer
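
evaluate() now pushes the docs through each pipeline component explicitly, using a component's batched pipe() method when it has one, so annotations and tensors are set before scoring. A short usage sketch, assuming a loaded nlp object; dev_data and its annotation dicts are hypothetical:

from spacy.gold import GoldParse

# dev_data: hypothetical list of (text, annotation-dict) pairs
docs_golds = []
for text, annot in dev_data:
    doc = nlp.make_doc(text)
    docs_golds.append((doc, GoldParse(doc, **annot)))

scorer = nlp.evaluate(docs_golds)
print(scorer.scores)   # UAS/LAS, tag accuracy, NER P/R/F, etc.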