Mirror of https://github.com/explosion/spaCy.git
Restore changes from nn-beam-parser

Commit: 11c31d285c
Parent: ce321b0322
@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.
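The hunk above tightens the converter's CLI contract: `n_sents` is parsed as an int instead of a float, and `n_sents=1` / `morphology=False` become explicit defaults. Below is a minimal, runnable sketch of how plac consumes these (help, kind, abbrev, type) annotation tuples; the function body is a hypothetical stand-in, not spaCy's converter, and the `cmd` dispatch argument is omitted for brevity.

import plac

@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
def convert(input_file, output_dir, n_sents=1, morphology=False):
    # plac coerces `-n` with the declared type, so `-n 10` arrives as 10 (int),
    # and omitting the option falls back to the new default of 1.
    print(input_file, output_dir, n_sents, morphology)

if __name__ == '__main__':
    plac.call(convert)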
@@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
+                           drop=next(dropout_rates), losses=losses,
+                           update_tensors=True)
                 pbar.update(sum(len(doc) for doc in docs))

         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
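This hunk drops the per-epoch `dill` pickle of the whole pipeline (the `nlp.to_disk` model directory remains the canonical artifact) and passes `update_tensors=True` so component gradients also flow back into the shared tok2vec tensors. A hedged sketch of the resulting call shape, assuming `nlp`, `optimizer` and `train_docs` are set up as in the surrounding train command; `minibatch`, `compounding` and `decaying` are spaCy v2-era helpers from `spacy.util`:

from spacy.util import minibatch, compounding, decaying

batch_sizes = compounding(1., 32., 1.001)   # batch size grows each step
dropout_rates = decaying(0.2, 0.0, 1e-4)    # dropout decays each step

losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
    docs, golds = zip(*batch)
    nlp.update(docs, golds, sgd=optimizer,
               drop=next(dropout_rates), losses=losses,
               update_tensors=True)  # backprop component gradients into tok2vec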
@@ -277,7 +277,8 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None,
+               update_tensors=False):
         """Update the models in the pipeline.

         docs (iterable): A batch of `Doc` objects.
@@ -304,14 +305,17 @@ class Language(object):
                 grads[key] = (W, dW)
         pipes = list(self.pipeline[1:])
         random.shuffle(pipes)
+        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
+        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
         for proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
-            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            if d_tokvecses is not None:
-                bp_tokvecses(d_tokvecses, sgd=sgd)
+            if update_tensors and d_tokvecses is not None:
+                for i, d_tv in enumerate(d_tokvecses):
+                    all_d_tokvecses[i] += d_tv
+        bp_tokvecses(all_d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.
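The key change in this hunk: `tok2vec.model.begin_update` is hoisted out of the component loop, each component's `d_tokvecses` is summed into `all_d_tokvecses`, and the backprop callback runs once at the end instead of once per component. A self-contained toy of that accumulate-then-backprop pattern; the doubling "layer" and the per-pipe gradients are made up for illustration:

import numpy

def begin_update(X):
    # Forward pass of a toy layer; returns output plus a backprop callback.
    Y = X * 2.0
    def backprop(dY):
        return dY * 2.0  # gradient of a doubling layer
    return Y, backprop

tokvecs, bp_tokvecs = begin_update(numpy.ones((4, 3)))
all_d_tokvecs = numpy.zeros(tokvecs.shape)

# Each "pipe" contributes a gradient w.r.t. the shared tensor.
for pipe_grad in (tokvecs * 0.1, tokvecs * 0.5):
    all_d_tokvecs += pipe_grad   # accumulate rather than backprop per pipe

d_input = bp_tokvecs(all_d_tokvecs)  # one backward pass covers all pipes
print(d_input)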
@@ -381,9 +385,18 @@ class Language(object):
         return optimizer

     def evaluate(self, docs_golds):
-        docs, golds = zip(*docs_golds)
         scorer = Scorer()
-        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
+        docs, golds = zip(*docs_golds)
+        docs = list(docs)
+        golds = list(golds)
+        for pipe in self.pipeline:
+            if not hasattr(pipe, 'pipe'):
+                for doc in docs:
+                    pipe(doc)
+            else:
+                docs = list(pipe.pipe(docs))
+        assert len(docs) == len(golds)
+        for doc, gold in zip(docs, golds):
             scorer.score(doc, gold)
             doc.tensor = None
         return scorer
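The reworked `evaluate` no longer streams through `self.pipe`; it applies each pipeline component to the whole batch itself, using a component's `pipe` method when it has one and falling back to doc-by-doc calls otherwise, then scores. A runnable toy of that dispatch, with strings standing in for `Doc` objects and hypothetical components (real per-doc components mutate the `Doc` in place):

class BatchTagger:
    def pipe(self, docs):            # batch-capable component
        for doc in docs:
            yield doc + '+tagged'

class DocMatcher:
    def __call__(self, doc):         # per-doc component: no `pipe` method
        return doc

pipeline = [BatchTagger(), DocMatcher()]
docs = ['doc1', 'doc2']

for pipe in pipeline:
    if not hasattr(pipe, 'pipe'):
        for doc in docs:
            pipe(doc)
    else:
        docs = list(pipe.pipe(docs))

print(docs)  # ['doc1+tagged', 'doc2+tagged']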