diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a0a76e5ec..fef6753e6 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -21,10 +21,10 @@ CONVERTERS = {
 @plac.annotations(
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
+    n_sents=("Number of sentences per doc", "option", "n", int),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents, morphology):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index af028dae5..04aac8319 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses)
+                           drop=next(dropout_rates), losses=losses,
+                           update_tensors=True)
                 pbar.update(sum(len(doc) for doc in docs))
 
         with nlp.use_params(optimizer.averages):
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
-                dill.dump(nlp, file_, -1)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
             scorer = nlp_loaded.evaluate(
diff --git a/spacy/language.py b/spacy/language.py
index 0284c4636..cb679a2bc 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -277,7 +277,8 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)
 
-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None,
+               update_tensors=False):
         """Update the models in the pipeline.
 
         docs (iterable): A batch of `Doc` objects.
@@ -304,14 +305,17 @@ class Language(object):
             grads[key] = (W, dW)
         pipes = list(self.pipeline[1:])
         random.shuffle(pipes)
+        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
+        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
         for proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
-            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
             d_tokvecses = proc.update((docs, tokvecses), golds,
                                       drop=drop, sgd=get_grads, losses=losses)
-            if d_tokvecses is not None:
-                bp_tokvecses(d_tokvecses, sgd=sgd)
+            if update_tensors and d_tokvecses is not None:
+                for i, d_tv in enumerate(d_tokvecses):
+                    all_d_tokvecses[i] += d_tv
+        bp_tokvecses(all_d_tokvecses, sgd=sgd)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
         # Clear the tensor variable, to free GPU memory.
@@ -381,9 +385,18 @@ class Language(object):
         return optimizer
 
     def evaluate(self, docs_golds):
-        docs, golds = zip(*docs_golds)
         scorer = Scorer()
-        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
+        docs, golds = zip(*docs_golds)
+        docs = list(docs)
+        golds = list(golds)
+        for pipe in self.pipeline:
+            if not hasattr(pipe, 'pipe'):
+                for doc in docs:
+                    pipe(doc)
+            else:
+                docs = list(pipe.pipe(docs))
+        assert len(docs) == len(golds)
+        for doc, gold in zip(docs, golds):
             scorer.score(doc, gold)
             doc.tensor = None
         return scorer
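
Notes on the spacy/language.py update() hunk: the old code re-ran the tok2vec
forward pass for every pipeline component and backpropagated each component's
gradient immediately. The patch runs the shared tok2vec model once per batch,
sums each component's d_tokvecses into a shared buffer, and performs a single
backward pass, gated by the new update_tensors flag. A minimal sketch of the
accumulation pattern, where encoder, components, and their method signatures
are hypothetical stand-ins rather than spaCy API:

    import numpy

    def update_shared_encoder(encoder, components, feats, golds, sgd=None,
                              update_tensors=False):
        # Forward pass through the shared encoder, done once per batch.
        tokvecses, bp_tokvecses = encoder.begin_update(feats)
        # One zeroed gradient buffer per output array.
        all_d_tokvecses = [numpy.zeros_like(tv) for tv in tokvecses]
        for component in components:
            d_tokvecses = component.update(tokvecses, golds)
            if update_tensors and d_tokvecses is not None:
                # Accumulate instead of backpropagating per component.
                for i, d_tv in enumerate(d_tokvecses):
                    all_d_tokvecses[i] += d_tv
        # Single backward pass with the summed gradients.
        bp_tokvecses(all_d_tokvecses, sgd=sgd)

Summing is valid because every component reads the same token vectors, so the
total gradient with respect to the shared encoder is the sum of the
per-component gradients; it also means the encoder weights no longer shift
between one component's update and the next within a batch.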
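
Notes on the evaluate() hunk: the old implementation scored docs straight from
self.pipe(docs, batch_size=32); the new one materializes the docs and applies
each pipeline component explicitly, using a component's pipe() method when it
has one and calling the component on each doc otherwise. A sketch of that
dispatch, assuming components annotate Doc objects in place (apply_pipeline is
an illustrative helper, not part of the patch):

    def apply_pipeline(pipeline, docs):
        # Apply each component to the whole batch, preferring the batched
        # pipe() method when a component provides one.
        docs = list(docs)
        for pipe in pipeline:
            if hasattr(pipe, 'pipe'):
                docs = list(pipe.pipe(docs))
            else:
                # Components without pipe() annotate each doc in place.
                for doc in docs:
                    pipe(doc)
        return docs

The assert len(docs) == len(golds) guard in the patch then catches any
component whose pipe() drops or duplicates documents before scoring begins.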