Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00
	Add model-last saving mechanism to pretraining (#12459)
* Adjust pretrain command
* Change naming and add finally block
* Add unit test
* Add unit test assertions
* Update spacy/training/pretrain.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Change finally block
* Add to docs
* Update website/docs/usage/embeddings-transformers.mdx
* Add flag to skip saving model-last

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:

parent 4a1ec332de
commit de32011e4c
spacy/cli/pretrain.py
@@ -23,6 +23,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +75,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
 
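In practice, the new flag flows straight through to the training loop: a run like `python -m spacy pretrain config.cfg ./output --skip-last` (file names hypothetical) would suppress `model-last.bin`, while omitting the flag keeps the new default of always writing it.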
spacy/tests/training/test_pretrain.py
@@ -165,7 +165,8 @@ def test_pretraining_default():
 
 
 @pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
     """Test that pretraining works with the character objective"""
     config = Config().from_str(pretrain_string_listener)
     config["pretraining"]["objective"] = objective
@@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
         filled["paths"]["raw_text"] = file_path
         filled = filled.interpolate()
         assert filled["pretraining"]["component"] == "tok2vec"
-        pretrain(filled, tmp_dir)
+        pretrain(filled, tmp_dir, skip_last=skip_last)
         assert Path(tmp_dir / "model0.bin").exists()
         assert Path(tmp_dir / "model4.bin").exists()
         assert not Path(tmp_dir / "model5.bin").exists()
+        if skip_last:
+            assert not Path(tmp_dir / "model-last.bin").exists()
+        else:
+            assert Path(tmp_dir / "model-last.bin").exists()
 
 
 @pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
         pretrain(filled, tmp_dir)
         assert Path(tmp_dir / "model0.bin").exists()
         assert Path(tmp_dir / "model4.bin").exists()
+        assert Path(tmp_dir / "model-last.bin").exists()
         assert not Path(tmp_dir / "model5.bin").exists()
 
 
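Outside the test suite, the same behavior is reachable programmatically, since `skip_last` is an ordinary keyword argument of `pretrain`. A minimal sketch, assuming a filled pretraining config on disk (the file names here are hypothetical):

```python
from pathlib import Path

from spacy import util
from spacy.training.pretrain import pretrain

# Hypothetical paths: any filled config with a [pretraining] block and a
# valid [paths.raw_text] entry will do.
config = util.load_config(Path("config.cfg"), interpolate=True)
output_dir = Path("pretrain_output")
output_dir.mkdir(exist_ok=True)

# As in the tests above: skip_last=True suppresses model-last.bin, while the
# default (False) writes it alongside the per-epoch model0.bin, model1.bin, ...
pretrain(config, output_dir, skip_last=True)
```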
spacy/training/pretrain.py
@@ -24,6 +24,7 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
     silent: bool = True,
+    skip_last: bool = False,
 ):
     msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
@@ -60,10 +61,14 @@ def pretrain(
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 
-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                 file_.write(model.get_ref("tok2vec").to_bytes())
             log = {
                 "nr_word": tracker.nr_word,
@@ -76,22 +81,26 @@ def pretrain(
 
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
+    try:
+        for epoch in range(epoch_resume, P["max_epochs"]):
+            for batch_id, batch in enumerate(batcher(corpus(nlp))):
+                docs = ensure_docs(batch)
+                loss = make_update(model, docs, optimizer, objective)
+                progress = tracker.update(epoch, loss, docs)
+                if progress:
+                    msg.row(progress, **row_settings)
+                if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                    _save_model(epoch, is_temp=True)
 
-        if P["n_save_epoch"]:
-            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+            if P["n_save_epoch"]:
+                if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+                    _save_model(epoch)
+            else:
                 _save_model(epoch)
-        else:
-            _save_model(epoch)
-        tracker.epoch_loss = 0.0
+            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)
 
 
 def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
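The `try`/`finally` arrangement is the core of the change: because the final save runs in `finally`, `model-last.bin` is written even when pretraining is cut short by an error or Ctrl-C. Reduced to its essentials, the pattern looks like this (a standalone sketch with dummy weights, not spaCy's actual code):

```python
from pathlib import Path


def run_pretraining(output_dir: Path, max_epochs: int, skip_last: bool = False) -> None:
    def save(name: str) -> None:
        # Stand-in for writing model.get_ref("tok2vec").to_bytes()
        (output_dir / name).write_bytes(b"weights")

    try:
        for epoch in range(max_epochs):
            ...  # run the update steps for this epoch
            save(f"model{epoch}.bin")
    finally:
        # Runs on normal completion, KeyboardInterrupt, or any exception,
        # so a usable final snapshot is always left behind.
        if not skip_last:
            save("model-last.bin")
```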
website/docs/api/cli.mdx
@@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
 $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
-| Name                    | Description                                                                                                                                                                                                        |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `config_path`           | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `output_dir`            | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
-| `--code`, `-c`          | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
-| `--resume-path`, `-r`   | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
-| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
-| `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
-| `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
-| overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              |
-| **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               |
+| Name                                               | Description                                                                                                                                                                                                        |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
+| `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
+| `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
+| `--gpu-id`, `-g`                                   | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
+| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~                                                                                                                                                 |
+| `--help`, `-h`                                     | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
+| overrides                                          | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              |
+| **CREATES**                                        | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               |
 
 ## evaluate {id="evaluate",version="2",tag="command"}
 
website/docs/usage/embeddings-transformers.mdx
@@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
 that you want to use from pretraining.
 
 A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:
 
 ```ini {title="config.cfg"}
 
 [paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"
 
 [initialize]
 init_tok2vec = ${paths.init_tok2vec}
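For context, the `.bin` files that pretraining writes are just the serialized `tok2vec` weights, and `init_tok2vec` loads them back into the matching layer. A rough sketch of that round trip, assuming a pipeline whose `tok2vec` component matches the pretrained architecture (paths hypothetical):

```python
from pathlib import Path

import spacy

nlp = spacy.blank("en")
tok2vec = nlp.add_pipe("tok2vec")
nlp.initialize()

out = Path("pretrain")
out.mkdir(exist_ok=True)

# Pretraining saves weights roughly like this (tok2vec.model is a Thinc model):
(out / "model-last.bin").write_bytes(tok2vec.model.to_bytes())

# And initialize.init_tok2vec effectively reads them back like this:
tok2vec.model.from_bytes((out / "model-last.bin").read_bytes())
```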