From 8e2cef49f3a67575454f4ba583aefa908e9b0407 Mon Sep 17 00:00:00 2001
From: Motoki Wu
Date: Mon, 22 Apr 2019 05:10:16 -0700
Subject: [PATCH] Add save after `--save-every` batches for `spacy pretrain` (#3510)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using `spacy pretrain`, the model is saved only after every epoch. But each epoch can be very large, since `pretrain` is used for language modeling tasks. So I added a `--save-every` option to the CLI that saves the model after every `--save-every` batches.

## Description

To test...

Save this file to `sample_sents.jsonl`:

```
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
{"text": "hello there."}
```

Then run pretraining with `--save-every 2`:

```bash
spacy pretrain sample_sents.jsonl en_core_web_md here -nw 1 -bs 1 -i 10 --save-every 2
```

This saves the model to the `here/` folder after every 2 batches. Models saved during an epoch have `.temp` appended to the file name.

At the end of training, you should see these files (`ls here/`):

```bash
config.json      model2.bin       model5.bin       model8.bin
log.jsonl        model2.temp.bin  model5.temp.bin  model8.temp.bin
model0.bin       model3.bin       model6.bin       model9.bin
model0.temp.bin  model3.temp.bin  model6.temp.bin  model9.temp.bin
model1.bin       model4.bin       model7.bin
model1.temp.bin  model4.temp.bin  model7.temp.bin
```

### Types of change

This is a new feature for `spacy pretrain`.

🌵 **Unfortunately, I haven't been able to test this because compiling from source is not working (cythonize error).**

```
Processing matcher.pyx
[Errno 2] No such file or directory: '/Users/mwu/github/spaCy/spacy/matcher.pyx'
Traceback (most recent call last):
  File "/Users/mwu/github/spaCy/bin/cythonize.py", line 169, in <module>
    run(args.root)
  File "/Users/mwu/github/spaCy/bin/cythonize.py", line 158, in run
    process(base, filename, db)
  File "/Users/mwu/github/spaCy/bin/cythonize.py", line 124, in process
    preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
  File "/Users/mwu/github/spaCy/bin/cythonize.py", line 87, in preserve_cwd
    func(*args)
  File "/Users/mwu/github/spaCy/bin/cythonize.py", line 63, in process_pyx
    raise Exception("Cython failed")
Exception: Cython failed
Traceback (most recent call last):
  File "setup.py", line 276, in <module>
    setup_package()
  File "setup.py", line 209, in setup_package
    generate_cython(root, "spacy")
  File "setup.py", line 132, in generate_cython
    raise RuntimeError("Running cythonize failed")
RuntimeError: Running cythonize failed
```

Edit: Fixed! The build works after deleting all `.cpp` files: `find spacy -name "*.cpp" | xargs rm`

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
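For reviewers: a minimal, standalone sketch of the save pattern the diff below implements. The directory name, the byte payload, and the loop sizes are made-up stand-ins for illustration; the real logic lives in `_save_model()` in the diff.

```python
# Illustrative sketch only: mid-epoch saves get a ".temp" suffix,
# the end-of-epoch save does not (mirrors _save_model() in the diff).
from pathlib import Path

output_dir = Path("here")  # stand-in for the CLI's [output_dir] argument
output_dir.mkdir(exist_ok=True)

def save_model(epoch, is_temp=False):
    suffix = ".temp" if is_temp else ""
    weights = b"..."  # stand-in for model.tok2vec.to_bytes()
    with (output_dir / ("model%d%s.bin" % (epoch, suffix))).open("wb") as file_:
        file_.write(weights)

n_save_every = 2
for epoch in range(10):  # -i 10
    for batch_id in range(4):  # a few toy batches per epoch
        if n_save_every and (batch_id % n_save_every == 0):
            save_model(epoch, is_temp=True)  # overwrites the epoch's .temp file
    save_model(epoch)  # final save for the epoch, without ".temp"
```

Because every mid-epoch save reuses the same `model{epoch}.temp.bin` path, you end up with exactly one `.temp` file per epoch, which matches the `ls here/` listing above.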
---
 spacy/cli/pretrain.py   | 43 +++++++++++++++++++++++++++----------------
 website/docs/api/cli.md |  2 ++
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 0b316b47c..ef91937a6 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -34,7 +34,8 @@ from .. import util
     max_length=("Max words per example.", "option", "xw", int),
     min_length=("Min words per example.", "option", "nw", int),
     seed=("Seed for random number generators", "option", "s", float),
-    nr_iter=("Number of iterations to pretrain", "option", "i", int),
+    n_iter=("Number of iterations to pretrain", "option", "i", int),
+    n_save_every=("Save model every X batches.", "option", "se", int),
 )
 def pretrain(
     texts_loc,
@@ -46,11 +47,12 @@ def pretrain(
     loss_func="cosine",
     use_vectors=False,
     dropout=0.2,
-    nr_iter=1000,
+    n_iter=1000,
     batch_size=3000,
     max_length=500,
     min_length=5,
     seed=0,
+    n_save_every=None,
 ):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@@ -115,9 +117,26 @@ def pretrain(
     msg.divider("Pre-training tok2vec layer")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
-    for epoch in range(nr_iter):
-        for batch in util.minibatch_by_words(
-            ((text, None) for text in texts), size=batch_size
+
+    def _save_model(epoch, is_temp=False):
+        is_temp_str = ".temp" if is_temp else ""
+        with model.use_params(optimizer.averages):
+            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
+                "wb"
+            ) as file_:
+                file_.write(model.tok2vec.to_bytes())
+            log = {
+                "nr_word": tracker.nr_word,
+                "loss": tracker.loss,
+                "epoch_loss": tracker.epoch_loss,
+                "epoch": epoch,
+            }
+            with (output_dir / "log.jsonl").open("a") as file_:
+                file_.write(srsly.json_dumps(log) + "\n")
+
+    for epoch in range(n_iter):
+        for batch_id, batch in enumerate(
+            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
         ):
             docs = make_docs(
                 nlp,
@@ -133,17 +152,9 @@ def pretrain(
             msg.row(progress, **row_settings)
             if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                 break
-        with model.use_params(optimizer.averages):
-            with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
-                file_.write(model.tok2vec.to_bytes())
-        log = {
-            "nr_word": tracker.nr_word,
-            "loss": tracker.loss,
-            "epoch_loss": tracker.epoch_loss,
-            "epoch": epoch,
-        }
-        with (output_dir / "log.jsonl").open("a") as file_:
-            file_.write(srsly.json_dumps(log) + "\n")
+            if n_save_every and (batch_id % n_save_every == 0):
+                _save_model(epoch, is_temp=True)
+        _save_model(epoch)
         tracker.epoch_loss = 0.0
         if texts_loc != "-":
             # Reshuffle the texts if texts were loaded from a file
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 6d3a33c49..0bacfb3a0 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -285,6 +285,7 @@ improvement.
 ```bash
 $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
 [--depth] [--embed-rows] [--dropout] [--seed] [--n-iter] [--use-vectors]
+[--n-save_every]
 ```
 
 | Argument | Type | Description |
@@ -302,6 +303,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
 | `--seed`, `-s` | option | Seed for random number generators. |
 | `--n-iter`, `-i` | option | Number of iterations to pretrain. |
 | `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. |
+| `--n-save_every`, `-se` | option | Save model every X batches. |
 | **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
 
 ### JSONL format for raw text {#pretrain-jsonl}
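As a quick sanity check after a run, the `log.jsonl` that `_save_model()` appends to can be read back with `srsly`. This snippet is not part of the patch; it assumes a pretraining run has populated `here/` as in the example above:

```python
# Read back the log that pretraining appends one JSON entry to per save.
# The field names match the `log` dict built in _save_model() above.
import srsly

for entry in srsly.read_jsonl("here/log.jsonl"):
    print(entry["epoch"], entry["nr_word"], entry["epoch_loss"])
```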