mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Restore changes from nn-beam-parser
This commit is contained in:
parent
ce321b0322
commit
11c31d285c
|
@ -21,10 +21,10 @@ CONVERTERS = {
|
|||
@plac.annotations(
|
||||
input_file=("input file", "positional", None, str),
|
||||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", float),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
||||
)
|
||||
def convert(cmd, input_file, output_dir, n_sents, morphology):
|
||||
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
|
|
|
@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
|||
for batch in minibatch(train_docs, size=batch_sizes):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer,
|
||||
drop=next(dropout_rates), losses=losses)
|
||||
drop=next(dropout_rates), losses=losses,
|
||||
update_tensors=True)
|
||||
pbar.update(sum(len(doc) for doc in docs))
|
||||
|
||||
with nlp.use_params(optimizer.averages):
|
||||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
nlp.to_disk(epoch_model_path)
|
||||
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
|
||||
dill.dump(nlp, file_, -1)
|
||||
nlp_loaded = lang_class(pipeline=pipeline)
|
||||
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
|
||||
scorer = nlp_loaded.evaluate(
|
||||
|
|
|
@ -277,7 +277,8 @@ class Language(object):
|
|||
def make_doc(self, text):
|
||||
return self.tokenizer(text)
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None,
|
||||
update_tensors=False):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
|
@ -304,14 +305,17 @@ class Language(object):
|
|||
grads[key] = (W, dW)
|
||||
pipes = list(self.pipeline[1:])
|
||||
random.shuffle(pipes)
|
||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
|
||||
for proc in pipes:
|
||||
if not hasattr(proc, 'update'):
|
||||
continue
|
||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||
drop=drop, sgd=get_grads, losses=losses)
|
||||
if d_tokvecses is not None:
|
||||
bp_tokvecses(d_tokvecses, sgd=sgd)
|
||||
if update_tensors and d_tokvecses is not None:
|
||||
for i, d_tv in enumerate(d_tokvecses):
|
||||
all_d_tokvecses[i] += d_tv
|
||||
bp_tokvecses(all_d_tokvecses, sgd=sgd)
|
||||
for key, (W, dW) in grads.items():
|
||||
sgd(W, dW, key=key)
|
||||
# Clear the tensor variable, to free GPU memory.
|
||||
|
@ -381,9 +385,18 @@ class Language(object):
|
|||
return optimizer
|
||||
|
||||
def evaluate(self, docs_golds):
|
||||
docs, golds = zip(*docs_golds)
|
||||
scorer = Scorer()
|
||||
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
|
||||
docs, golds = zip(*docs_golds)
|
||||
docs = list(docs)
|
||||
golds = list(golds)
|
||||
for pipe in self.pipeline:
|
||||
if not hasattr(pipe, 'pipe'):
|
||||
for doc in docs:
|
||||
pipe(doc)
|
||||
else:
|
||||
docs = list(pipe.pipe(docs))
|
||||
assert len(docs) == len(golds)
|
||||
for doc, gold in zip(docs, golds):
|
||||
scorer.score(doc, gold)
|
||||
doc.tensor = None
|
||||
return scorer
|
||||
|
|
Loading…
Reference in New Issue
Block a user