diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 86293fd40..c0fd27c3c 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-init_tok2vec = null
+vectors = null
 vocab_data = null
+init_tok2vec = null
 
 [system]
 seed = 0
@@ -96,19 +97,16 @@
 eps = 1e-8
 learn_rate = 0.001
 # The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own prepare step, giving them a chance
-# to gather resources like lookup-tables, build label sets, construct vocabularies,
-# etc. After 'prepare' is finished, the result will be saved out to disk, which
-# will then be read in at the start of training. You can call the prepare step
-# separately with the `spacy prepare` command, or you can let the train script
-# do it for you.
+# the tokenizer can each define their own arguments via their .initialize
+# methods that are populated by the config. This lets them gather resources like
+# lookup tables, build label sets, construct vocabularies, etc.
 [initialize]
-tokenizer = {}
-components = {}
-
-[initialize.vocab]
-data = ${paths.vocab_data}
+vocab_data = ${paths.vocab_data}
 lookups = null
-vectors = null
+vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments passed to the initialize methods of the components (keyed by component name)
+components = {}
diff --git a/spacy/language.py b/spacy/language.py
index ec2e42a35..ee73faed3 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1188,14 +1188,13 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-        V = I["vocab"]
         init_vocab(
-            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
         )
         pretrain_cfg = config.get("pretraining")
         if pretrain_cfg:
             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, V)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 0b2eeba68..658eeb574 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInitVocab(BaseModel):
+class ConfigSchemaInit(BaseModel):
     # fmt: off
-    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
+    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
@@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInit(BaseModel):
-    vocab: ConfigSchemaInitVocab
-    tokenizer: Any
-    components: Dict[StrictStr, Any]
-
-    class Config:
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 862c76448..aa5edde5d 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -121,15 +121,15 @@ def load_vectors_into_model(
 
 
 def init_tok2vec(
-    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    V = vocab_config
+    I = init_config
     weights_data = None
-    init_tok2vec = ensure_path(V["init_tok2vec"])
+    init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
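
For reference, a minimal sketch of how a user config could fill in the flattened `[initialize]` block after this change. The `ner` component and its `labels` value are hypothetical placeholders, not part of this diff; the point is that anything under `[initialize.components.<name>]` is resolved by `ConfigSchemaInit` and forwarded to that component's `.initialize` method.

```ini
[initialize]
vocab_data = null
lookups = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}

# Arguments forwarded to Tokenizer.initialize
[initialize.tokenizer]

# Per-component arguments, keyed by component name
[initialize.components]

# Hypothetical: passed to the "ner" component's .initialize method
[initialize.components.ner]
labels = ["PERSON", "ORG"]
```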