Restore changes from nn-beam-parser

This commit is contained in:
Matthew Honnibal 2017-08-18 22:26:12 +02:00
parent ce321b0322
commit 11c31d285c
3 changed files with 23 additions and 11 deletions

View File

@@ -21,10 +21,10 @@ CONVERTERS = {
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
n_sents=("Number of sentences per doc", "option", "n", int),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.

View File

@@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
drop=next(dropout_rates), losses=losses,
update_tensors=True)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(

View File

@@ -277,7 +277,8 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_tensors=False):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
@@ -304,14 +305,17 @@ class Language(object):
grads[key] = (W, dW)
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
if update_tensors and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
@@ -381,9 +385,18 @@ class Language(object):
return optimizer
def evaluate(self, docs_golds):
docs, golds = zip(*docs_golds)
scorer = Scorer()
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold)
doc.tensor = None
return scorer