Update default config [ci skip]

2025-10-30 23:47:31 +03:00 · 2020-10-01 22:27:37 +02:00 · 2020-10-01 22:27:37 +02:00 · 5762876dcc
commit 5762876dcc
parent 86c3ec9c2b
1 changed files with 16 additions and 10 deletions
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -2,7 +2,6 @@
 train = null
 dev = null
 vectors = null
-vocab_data = null
 init_tok2vec = null

 [system]
@ -11,8 +10,13 @@ gpu_allocator = null

 [nlp]
 lang = null
+# List of pipeline component names, in order. The names should correspond to
+# components defined in the [components] block
 pipeline = []
+# Components that are loaded but disabled by default
 disabled = []
+# Optional callbacks to modify the nlp object before it's initialized, after
+# it's created and after the pipeline has been set up
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@ -20,6 +24,7 @@ after_pipeline_creation = null
 [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

+# The pipeline components and their models
 [components]

 # Readers for corpora like dev and train.
@ -38,8 +43,7 @@ max_length = 0
 limit = 0
 # Apply some simple data augmentation, where we replace tokens with variations.
 # This is especially useful for punctuation and case replacement, to help
-# generalize beyond corpora that don't have smart-quotes, or only have smart
-# quotes, etc.
+# generalize beyond corpora that don't/only have smart quotes etc.
 augmenter = null

 [corpora.dev]
@ -53,6 +57,7 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Optional callback for data augmentation
 augmenter = null

 # Training hyper-parameters and additional features.
@ -102,17 +107,18 @@ use_averages = false
 eps = 1e-8
 learn_rate = 0.001

-# The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own arguments via their .initialize
-# methods that are populated by the config. This lets them gather resources like
-# lookup tables and build label sets, construct vocabularies, etc.
+# These settings are used when nlp.initialize() is called (typically before
+# training or pretraining). Components and the tokenizer can each define their
+# own arguments via their initialize methods that are populated by the config.
+# This lets them gather data resources, build label sets etc.
 [initialize]
-vocab_data = ${paths.vocab_data}
-lookups = null
 vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Data and lookups for vocabulary
+vocab_data = null
+lookups = null
 # Arguments passed to the tokenizer's initialize method
 tokenizer = {}
-# Arguments passed to the initialize methods of the components (keyed by component name)
+# Arguments for initialize methods of the components (keyed by component)
 components = {}