Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

Matthew Honnibal 2020-09-29 16:57:38 +02:00
commit 8ce9f44433
4 changed files with 21 additions and 32 deletions


@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-init_tok2vec = null
 vectors = null
+vocab_data = null
+init_tok2vec = null
 
 [system]
 seed = 0
@@ -96,19 +97,16 @@ eps = 1e-8
 learn_rate = 0.001
 
 # The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own prepare step, giving them a chance
-# to gather resources like lookup-tables, build label sets, construct vocabularies,
-# etc. After 'prepare' is finished, the result will be saved out to disk, which
-# will then be read in at the start of training. You can call the prepare step
-# separately with the `spacy prepare` command, or you can let the train script
-# do it for you.
+# the tokenizer can each define their own arguments via their .initialize
+# methods, with values populated from the config. This lets them gather
+# resources like lookup tables, build label sets, construct vocabularies, etc.
 [initialize]
-tokenizer = {}
-components = {}
-
-[initialize.vocab]
-data = ${paths.vocab_data}
+vocab_data = ${paths.vocab_data}
 lookups = null
-vectors = null
+vectors = ${paths.vectors}
+# Extra resources for transfer-learning or pseudo-rehearsal
+init_tok2vec = ${paths.init_tok2vec}
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments passed to the initialize methods of the components (keyed by component name)
+components = {}
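
Note: everything that previously lived under [initialize.vocab] is now flat on [initialize], with paths interpolated in from [paths]. As a minimal sketch of how such a block parses and interpolates, using thinc's Config loader (the config string is a trimmed illustration, not the full default config):

from thinc.api import Config

cfg_str = """
[paths]
vectors = null
vocab_data = null
init_tok2vec = null

[initialize]
vocab_data = ${paths.vocab_data}
lookups = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
tokenizer = {}
components = {}
"""

# Variables like ${paths.vectors} are substituted during interpolation, so the
# resolved [initialize] section is one flat mapping of plain values.
config = Config().from_str(cfg_str, interpolate=True)
init = config["initialize"]
assert init["vocab_data"] is None   # ${paths.vocab_data} -> null -> None
assert init["tokenizer"] == {}      # per-tokenizer arguments, empty by default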


@@ -1188,14 +1188,13 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-        V = I["vocab"]
         init_vocab(
-            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
         )
         pretrain_cfg = config.get("pretraining")
         if pretrain_cfg:
             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, V)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
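
Note: with the flattened block, registry.resolve validates [initialize] against ConfigSchemaInit and hands back a single flat dict, so the same object serves both init_vocab and init_tok2vec, and the intermediate V = I["vocab"] unpacking disappears. A toy sketch of the before/after access pattern with plain dicts (illustrative stand-ins, not spaCy internals):

# Before: a nested "vocab" sub-dict had to be unpacked and passed separately.
old_I = {"vocab": {"data": None, "lookups": None, "vectors": None},
         "tokenizer": {}, "components": {}}
V = old_I["vocab"]
old_args = {"data": V["data"], "lookups": V["lookups"], "vectors": V["vectors"]}

# After: one flat dict carries every setting, so a single object can be passed
# through to both init_vocab and init_tok2vec.
new_I = {"vocab_data": None, "lookups": None, "vectors": None,
         "init_tok2vec": None, "tokenizer": {}, "components": {}}
new_args = {"data": new_I["vocab_data"], "lookups": new_I["lookups"],
            "vectors": new_I["vectors"]}

assert old_args == new_args  # same settings, one fewer level of nesting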


@@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInitVocab(BaseModel):
+class ConfigSchemaInit(BaseModel):
     # fmt: off
-    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
+    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
@@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInit(BaseModel):
-    vocab: ConfigSchemaInitVocab
-    tokenizer: Any
-    components: Dict[StrictStr, Any]
-
-    class Config:
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
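
Note: because the merged schema keeps extra = "forbid", a config that still uses the old nested [initialize.vocab] layout now fails validation loudly instead of being silently ignored. A small self-contained sketch of that behavior in pydantic v1 style (InitSchema is a trimmed, hypothetical stand-in for ConfigSchemaInit):

from typing import Any, Dict, Optional
from pydantic import BaseModel, Field, StrictStr, ValidationError

class InitSchema(BaseModel):
    vocab_data: Optional[StrictStr] = Field(..., title="Path to vocabulary data")
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    tokenizer: Dict[StrictStr, Any] = Field(..., title="Tokenizer.initialize arguments")

    class Config:
        extra = "forbid"
        arbitrary_types_allowed = True

InitSchema(vocab_data=None, vectors=None, tokenizer={})  # new flat layout: valid
try:
    # Old nested layout: "vocab" is now an unknown key and is rejected.
    InitSchema(vocab={"data": None}, vocab_data=None, vectors=None, tokenizer={})
except ValidationError as err:
    print(err)  # vocab: extra fields not permitted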


@@ -121,15 +121,15 @@ def load_vectors_into_model(
 
 
 def init_tok2vec(
-    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    V = vocab_config
+    I = init_config
     weights_data = None
-    init_tok2vec = ensure_path(V["init_tok2vec"])
+    init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
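
Note: the guard above only fires when pretrained tok2vec weights are supplied and the pretraining objective reconstructs vectors, in which case initialize.vectors must also be set. A minimal sketch of that logic in isolation (plain Python; ensure_path and ConfigValidationError are replaced with hypothetical stand-ins):

from pathlib import Path
from typing import Any, Dict, Optional

def check_init_tok2vec(P: Dict[str, Any], I: Dict[str, Any]) -> Optional[Path]:
    # Stand-in for ensure_path: coerce a string to a Path, pass None through.
    path = Path(I["init_tok2vec"]) if I["init_tok2vec"] is not None else None
    if path is not None:
        # A "vectors" objective predicts the static word vectors, so those
        # vectors must be available when the pretrained weights are loaded.
        if P["objective"].get("type") == "vectors" and not I["vectors"]:
            raise ValueError(
                'need initialize.vectors if pretraining.objective.type is "vectors"'
            )
    return path

check_init_tok2vec({"objective": {"type": "vectors"}},
                   {"init_tok2vec": "weights.bin", "vectors": "en_core_web_lg"})
# Raises ValueError: weights supplied but no vectors configured
# check_init_tok2vec({"objective": {"type": "vectors"}},
#                    {"init_tok2vec": "weights.bin", "vectors": None})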