mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Move rehearse functionality to existing train cli
This commit is contained in:
		
							parent
							
								
									97c86072e5
								
							
						
					
					
						commit
						e18a183f54
					
				|  | @ -1,83 +0,0 @@ | ||||||
| from typing import Optional, Dict, Any, Union |  | ||||||
| from pathlib import Path |  | ||||||
| from wasabi import msg |  | ||||||
| import typer |  | ||||||
| import logging |  | ||||||
| import sys |  | ||||||
| 
 |  | ||||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error |  | ||||||
| from ._util import import_code, setup_gpu |  | ||||||
| from ..training.loop import train as train_nlp |  | ||||||
| from ..training.initialize import init_nlp |  | ||||||
| from .. import util |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @app.command( |  | ||||||
|     "rehearse", |  | ||||||
|     context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, |  | ||||||
| ) |  | ||||||
| def rehearse_cli( |  | ||||||
|     # fmt: off |  | ||||||
|     ctx: typer.Context,  # This is only used to read additional arguments |  | ||||||
|     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), |  | ||||||
|     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), |  | ||||||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), |  | ||||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), |  | ||||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") |  | ||||||
|     # fmt: on |  | ||||||
| ): |  | ||||||
|     """ |  | ||||||
|     Rehearse a spaCy pipeline. Requires data in spaCy's binary format. To |  | ||||||
|     convert data from other formats, use the `spacy convert` command. The |  | ||||||
|     config file includes all settings and hyperparameters used during training. |  | ||||||
|     To override settings in the config, e.g. settings that point to local |  | ||||||
|     paths or that you want to experiment with, you can override them as |  | ||||||
|     command line options. For instance, --training.batch_size 128 overrides |  | ||||||
|     the value of "batch_size" in the block "[training]". The --code argument |  | ||||||
|     lets you pass in a Python file that's imported before training. It can be |  | ||||||
|     used to register custom functions and architectures that can then be |  | ||||||
|     referenced in the config. |  | ||||||
| 
 |  | ||||||
|     DOCS: https://spacy.io/api/cli#rehearse |  | ||||||
|     """ |  | ||||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) |  | ||||||
|     overrides = parse_config_overrides(ctx.args) |  | ||||||
|     import_code(code_path) |  | ||||||
|     rehearse(config_path, output_path, use_gpu=use_gpu, overrides=overrides) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def rehearse( |  | ||||||
|     config_path: Union[str, Path], |  | ||||||
|     output_path: Optional[Union[str, Path]] = None, |  | ||||||
|     *, |  | ||||||
|     use_gpu: int = -1, |  | ||||||
|     overrides: Dict[str, Any] = util.SimpleFrozenDict(), |  | ||||||
| ): |  | ||||||
|     config_path = util.ensure_path(config_path) |  | ||||||
|     output_path = util.ensure_path(output_path) |  | ||||||
|     # Make sure all files and paths exist if they are needed |  | ||||||
|     if not config_path or (str(config_path) != "-" and not config_path.exists()): |  | ||||||
|         msg.fail("Config file not found", config_path, exits=1) |  | ||||||
|     if not output_path: |  | ||||||
|         msg.info("No output directory provided") |  | ||||||
|     else: |  | ||||||
|         if not output_path.exists(): |  | ||||||
|             output_path.mkdir(parents=True) |  | ||||||
|             msg.good(f"Created output directory: {output_path}") |  | ||||||
|         msg.info(f"Saving to output directory: {output_path}") |  | ||||||
|     setup_gpu(use_gpu) |  | ||||||
|     with show_validation_error(config_path): |  | ||||||
|         config = util.load_config(config_path, overrides=overrides, interpolate=False) |  | ||||||
|     msg.divider("Initializing pipeline") |  | ||||||
|     with show_validation_error(config_path, hint_fill=False): |  | ||||||
|         nlp = init_nlp(config, use_gpu=use_gpu) |  | ||||||
|     msg.good("Initialized pipeline") |  | ||||||
|     msg.divider("Training pipeline") |  | ||||||
|     train_nlp( |  | ||||||
|         nlp, |  | ||||||
|         output_path, |  | ||||||
|         use_gpu=use_gpu, |  | ||||||
|         use_rehearse=True, |  | ||||||
|         stdout=sys.stdout, |  | ||||||
|         stderr=sys.stderr, |  | ||||||
|     ) |  | ||||||
|  | @ -22,7 +22,8 @@ def train_cli( | ||||||
|     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), |     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), | ||||||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), |     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), |     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") |     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||||
|  |     use_rehearse: bool = Opt(False, "--use_rehearse", "-r", help="Perform 'rehearsal updates' on a pre-trained model") | ||||||
|     # fmt: on |     # fmt: on | ||||||
| ): | ): | ||||||
|     """ |     """ | ||||||
|  | @ -42,7 +43,13 @@ def train_cli( | ||||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) |     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||||
|     overrides = parse_config_overrides(ctx.args) |     overrides = parse_config_overrides(ctx.args) | ||||||
|     import_code(code_path) |     import_code(code_path) | ||||||
|     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) |     train( | ||||||
|  |         config_path, | ||||||
|  |         output_path, | ||||||
|  |         use_gpu=use_gpu, | ||||||
|  |         overrides=overrides, | ||||||
|  |         use_rehearse=use_rehearse, | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def train( | def train( | ||||||
|  | @ -51,6 +58,7 @@ def train( | ||||||
|     *, |     *, | ||||||
|     use_gpu: int = -1, |     use_gpu: int = -1, | ||||||
|     overrides: Dict[str, Any] = util.SimpleFrozenDict(), |     overrides: Dict[str, Any] = util.SimpleFrozenDict(), | ||||||
|  |     use_rehearse: bool = False, | ||||||
| ): | ): | ||||||
|     config_path = util.ensure_path(config_path) |     config_path = util.ensure_path(config_path) | ||||||
|     output_path = util.ensure_path(output_path) |     output_path = util.ensure_path(output_path) | ||||||
|  | @ -72,4 +80,11 @@ def train( | ||||||
|         nlp = init_nlp(config, use_gpu=use_gpu) |         nlp = init_nlp(config, use_gpu=use_gpu) | ||||||
|     msg.good("Initialized pipeline") |     msg.good("Initialized pipeline") | ||||||
|     msg.divider("Training pipeline") |     msg.divider("Training pipeline") | ||||||
|     train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) |     train_nlp( | ||||||
|  |         nlp, | ||||||
|  |         output_path, | ||||||
|  |         use_gpu=use_gpu, | ||||||
|  |         stdout=sys.stdout, | ||||||
|  |         stderr=sys.stderr, | ||||||
|  |         use_rehearse=use_rehearse, | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  | @ -1050,7 +1050,7 @@ in the section `[paths]`. | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides] | $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [--use_rehearse] [overrides] | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| | Name              | Description                                                                                                                                                                                                        | | | Name              | Description                                                                                                                                                                                                        | | ||||||
|  | @ -1060,6 +1060,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] | ||||||
| | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               | | | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               | | ||||||
| | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       | | | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       | | ||||||
| | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         | | | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         | | ||||||
|  | | `--use_rehearse`, `-r`  | Use 'rehearsal' updates on a pre-trained model to address the catastrophic forgetting problem. Defaults to `False`. ~~bool (flag)~~                                                                                                                                                         |                                                                                                                                                      | ||||||
| | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         | | | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         | | ||||||
| | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         | | | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         | | ||||||
| | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                                          | | | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                                          | | ||||||
|  | @ -1135,38 +1136,6 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [ | ||||||
| | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              | | | overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~                              | | ||||||
| | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               | | | **CREATES**             | The pretrained weights that can be used to initialize `spacy train`.                                                                                                                                               | | ||||||
| 
 | 
 | ||||||
| ## rehearse {id="rehearse",tag="command, experimental"} |  | ||||||
| 
 |  | ||||||
| This command is designed to fine-tune pre-trained models while also trying to address the “catastrophic forgetting” problem.  |  | ||||||
| It uses "rehearsal" updates that teach the current model to make predictions similar to an initial model. This feature is experimental. |  | ||||||
| 
 |  | ||||||
| <Infobox title="Please note" variant="warning"> |  | ||||||
| 
 |  | ||||||
| The `rehearse` command outputs the sum of both losses from the `TrainablePipe.update` and `TrainablePipe.rehearse`.  |  | ||||||
| This can potentially cause the loss to increase drastically, even while the scores are also increasing. This is likely because the model's predictions diverge further from those of the initial model. |  | ||||||
| 
 |  | ||||||
| </Infobox> |  | ||||||
| 
 |  | ||||||
| > #### Example |  | ||||||
| > |  | ||||||
| > ```bash |  | ||||||
| > $ python -m spacy rehearse config.cfg --output ./output --paths.train ./train --paths.dev ./dev |  | ||||||
| > ``` |  | ||||||
| 
 |  | ||||||
| ```bash |  | ||||||
| $ python -m spacy rehearse [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides] |  | ||||||
| ``` |  | ||||||
| 
 |  | ||||||
| | Name              | Description                                                                                                                                                                                                        | |  | ||||||
| | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | |  | ||||||
| | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ | |  | ||||||
| | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          | |  | ||||||
| | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               | |  | ||||||
| | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       | |  | ||||||
| | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         | |  | ||||||
| | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         | |  | ||||||
| | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         | |  | ||||||
| | **CREATES**       | The final rehearsed pipeline and the best rehearsed pipeline.   |  | ||||||
| 
 | 
 | ||||||
| ## evaluate {id="evaluate",version="2",tag="command"} | ## evaluate {id="evaluate",version="2",tag="command"} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user