mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge pull request #6209 from svlandeg/feature/doc-updates
This commit is contained in:
		
						commit
						60f9e8e1d0
					
				| 
						 | 
					@ -278,7 +278,7 @@ def show_validation_error(
 | 
				
			||||||
                "fill-config' command to fill in all the defaults, if possible:",
 | 
					                "fill-config' command to fill in all the defaults, if possible:",
 | 
				
			||||||
                spaced=True,
 | 
					                spaced=True,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n")
 | 
					            print(f"{COMMAND} init fill-config {config_path} {config_path} \n")
 | 
				
			||||||
        sys.exit(1)
 | 
					        sys.exit(1)
 | 
				
			||||||
    except InterpolationError as e:
 | 
					    except InterpolationError as e:
 | 
				
			||||||
        msg.fail("Config validation error", e, exits=1)
 | 
					        msg.fail("Config validation error", e, exits=1)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -300,17 +300,16 @@ $ python -m spacy debug config [config_path] [--code] [--show-functions] [--show
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
✘ Config validation error
 | 
					✘ Config validation error
 | 
				
			||||||
 | 
					dropout     field required
 | 
				
			||||||
 | 
					optimizer   field required
 | 
				
			||||||
 | 
					optimize    extra fields not permitted
 | 
				
			||||||
 | 
					
 | 
				
			||||||
training -> dropout     field required
 | 
					{'seed': 0, 'accumulate_gradient': 1, 'dev_corpus': 'corpora.dev', 'train_corpus': 'corpora.train', 'gpu_allocator': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'before_to_disk': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'logger': {'@loggers': 'spacy.ConsoleLogger.v1', 'progress_bar': False}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
 | 
				
			||||||
training -> optimizer   field required
 | 
					 | 
				
			||||||
training -> optimize    extra fields not permitted
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
If your config contains missing values, you can run the 'init fill-config'
 | 
					If your config contains missing values, you can run the 'init fill-config'
 | 
				
			||||||
command to fill in all the defaults, if possible:
 | 
					command to fill in all the defaults, if possible:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg
 | 
					python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-config_invalid.cfg
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
</Accordion>
 | 
					</Accordion>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -181,20 +181,20 @@ This section defines settings and controls for the training and evaluation
 | 
				
			||||||
process that are used when you run [`spacy train`](/api/cli#train).
 | 
					process that are used when you run [`spacy train`](/api/cli#train).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                  | Description                                                                                                                                                                                                                                                                                                                         |
 | 
					| Name                  | Description                                                                                                                                                                                                                                                                                                                         |
 | 
				
			||||||
| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
| `accumulate_gradient` | Number of substeps to divide the batch into. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                               |
 | 
					| `accumulate_gradient` | Number of substeps to divide the batch into. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                               |
 | 
				
			||||||
| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~                                                                                                                        |
 | 
					| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~                                                                                                                        |
 | 
				
			||||||
| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
 | 
					| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
 | 
				
			||||||
| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
 | 
					| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
 | 
				
			||||||
| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
 | 
					| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
 | 
				
			||||||
| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
 | 
					| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
 | 
				
			||||||
| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                           |
 | 
					| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
 | 
				
			||||||
| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
 | 
					| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
 | 
				
			||||||
 | 
					| `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]~~   |
 | 
				
			||||||
| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                                                                                                                     |
 | 
					| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                                                                                                                     |
 | 
				
			||||||
| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                                                           |
 | 
					| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                                                           |
 | 
				
			||||||
| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
 | 
					| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
 | 
				
			||||||
| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                                                                                                                     |
 | 
					| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                                                                                                                     |
 | 
				
			||||||
| `raw_text`            | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~                                                                       |
 | 
					 | 
				
			||||||
| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
 | 
					| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
 | 
				
			||||||
| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
 | 
					| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
 | 
				
			||||||
| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |
 | 
					| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |
 | 
				
			||||||
| 
						 | 
					@ -206,16 +206,16 @@ This section is optional and defines settings and controls for
 | 
				
			||||||
used when you run [`spacy pretrain`](/api/cli#pretrain).
 | 
					used when you run [`spacy pretrain`](/api/cli#pretrain).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name           | Description                                                                                                                                                                                                  |
 | 
					| Name           | Description                                                                                                                                                                                                  |
 | 
				
			||||||
| -------------- | ------------------------------------------------------------------------------------------------------ |
 | 
					| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~                                                                                                                                                        |
 | 
					| `max_epochs`   | Maximum number of epochs. Defaults to `1000`. ~~int~~                                                                                                                                                        |
 | 
				
			||||||
| `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~                                                                                                                                                               |
 | 
					| `dropout`      | The dropout rate. Defaults to `0.2`. ~~float~~                                                                                                                                                               |
 | 
				
			||||||
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                                                                                                                      |
 | 
					| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~                                                                                                                                                      |
 | 
				
			||||||
| `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~                                                                                                       |
 | 
					| `objective`    | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~                                                                                                       |
 | 
				
			||||||
| `optimizer`    | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~          |
 | 
					| `optimizer`    | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                      |
 | 
				
			||||||
| `corpus`       | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
 | 
					| `corpus`       | Dot notation of the config location defining the corpus with raw text. Defaults to `corpora.pretrain`. ~~str~~                                                                                               |
 | 
				
			||||||
| `batcher`      | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                      |
 | 
					| `batcher`      | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~ |
 | 
				
			||||||
| `component`    | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~                              |
 | 
					| `component`    | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~                                                                                                            |
 | 
				
			||||||
| `layer`        | The layer to pretrain. If empty, the whole component model will be used. ~~str~~                       |
 | 
					| `layer`        | The specific layer of the model to pretrain. If empty, the whole model will be used. ~~str~~                                                                                                                 |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### initialize {#config-initialize tag="section"}
 | 
					### initialize {#config-initialize tag="section"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training).
 | 
				
			||||||
If you don't want a component to be updated, you can **freeze** it by adding it
 | 
					If you don't want a component to be updated, you can **freeze** it by adding it
 | 
				
			||||||
to the `frozen_components` list in the `[training]` block. Frozen components are
 | 
					to the `frozen_components` list in the `[training]` block. Frozen components are
 | 
				
			||||||
**not updated** during training and are included in the final trained pipeline
 | 
					**not updated** during training and are included in the final trained pipeline
 | 
				
			||||||
as-is.
 | 
					as-is. They are also excluded when calling `nlp.initialize()`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Note on frozen components
 | 
					> #### Note on frozen components
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user