mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Tighten up format
This commit is contained in:
		
							parent
							
								
									978ab54a84
								
							
						
					
					
						commit
						fd594cfb9b
					
				|  | @ -1,8 +1,9 @@ | |||
| [paths] | ||||
| train = "" | ||||
| dev = "" | ||||
| init_tok2vec = null | ||||
| vectors = null | ||||
| vocab_data = null | ||||
| init_tok2vec = null | ||||
| 
 | ||||
| [system] | ||||
| seed = 0 | ||||
|  | @ -96,19 +97,16 @@ eps = 1e-8 | |||
| learn_rate = 0.001 | ||||
| 
 | ||||
| # The 'initialize' step is run before training or pretraining. Components and | ||||
| # the tokenizer can each define their own prepare step, giving them a chance | ||||
| # to gather resources like lookup-tables, build label sets, construct vocabularies, | ||||
| # etc. After 'prepare' is finished, the result will be saved out to disk, which | ||||
| # will then be read in at the start of training. You can call the prepare step | ||||
| # separately with the `spacy prepare` command, or you can let the train script | ||||
| # do it for you. | ||||
| # the tokenizer can each define their own arguments via their .initialize | ||||
| # methods, and those arguments are populated from the config. This lets them | ||||
| # gather resources like lookup tables, build label sets, construct vocabularies, etc. | ||||
| [initialize] | ||||
| tokenizer = {} | ||||
| components = {} | ||||
| 
 | ||||
| [initialize.vocab] | ||||
| data = ${paths.vocab_data} | ||||
| vocab_data = ${paths.vocab_data} | ||||
| lookups = null | ||||
| vectors = null | ||||
| vectors = ${paths.vectors} | ||||
| # Extra resources for transfer-learning or pseudo-rehearsal | ||||
| init_tok2vec = ${paths.init_tok2vec} | ||||
| # Arguments passed to the tokenizer's initialize method | ||||
| tokenizer = {} | ||||
| # Arguments passed to the initialize methods of the components (keyed by component name) | ||||
| components = {} | ||||
|  |  | |||
|  | @ -1203,14 +1203,13 @@ class Language: | |||
|         config = self.config.interpolate() | ||||
|         # These are the settings provided in the [initialize] block in the config | ||||
|         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) | ||||
|         V = I["vocab"] | ||||
|         init_vocab( | ||||
|             self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], | ||||
|             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"], | ||||
|         ) | ||||
|         pretrain_cfg = config.get("pretraining") | ||||
|         if pretrain_cfg: | ||||
|             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) | ||||
|             init_tok2vec(self, P, V) | ||||
|             init_tok2vec(self, P, I) | ||||
|         if self.vocab.vectors.data.shape[1] >= 1: | ||||
|             ops = get_current_ops() | ||||
|             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) | ||||
|  |  | |||
|  | @ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel): | |||
|         arbitrary_types_allowed = True | ||||
| 
 | ||||
| 
 | ||||
| class ConfigSchemaInitVocab(BaseModel): | ||||
| class ConfigSchemaInit(BaseModel): | ||||
|     # fmt: off | ||||
|     data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") | ||||
|     vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file") | ||||
|     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization") | ||||
|     vectors: Optional[StrictStr] = Field(..., title="Path to vectors") | ||||
|     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") | ||||
|     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") | ||||
|     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component") | ||||
|     # fmt: on | ||||
| 
 | ||||
|     class Config: | ||||
|  | @ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel): | |||
|         arbitrary_types_allowed = True | ||||
| 
 | ||||
| 
 | ||||
| class ConfigSchemaInit(BaseModel): | ||||
|     vocab: ConfigSchemaInitVocab | ||||
|     tokenizer: Any | ||||
|     components: Dict[StrictStr, Any] | ||||
| 
 | ||||
|     class Config: | ||||
|         extra = "forbid" | ||||
|         arbitrary_types_allowed = True | ||||
| 
 | ||||
| 
 | ||||
| class ConfigSchema(BaseModel): | ||||
|     training: ConfigSchemaTraining | ||||
|     nlp: ConfigSchemaNlp | ||||
|  |  | |||
|  | @ -121,15 +121,15 @@ def load_vectors_into_model( | |||
| 
 | ||||
| 
 | ||||
| def init_tok2vec( | ||||
|     nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any] | ||||
|     nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any] | ||||
| ) -> bool: | ||||
|     # Load pretrained tok2vec weights - cf. CLI command 'pretrain' | ||||
|     P = pretrain_config | ||||
|     V = vocab_config | ||||
|     I = init_config | ||||
|     weights_data = None | ||||
|     init_tok2vec = ensure_path(V["init_tok2vec"]) | ||||
|     init_tok2vec = ensure_path(I["init_tok2vec"]) | ||||
|     if init_tok2vec is not None: | ||||
|         if P["objective"].get("type") == "vectors" and not V["vectors"]: | ||||
|         if P["objective"].get("type") == "vectors" and not I["vectors"]: | ||||
|             err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"' | ||||
|             errors = [{"loc": ["initialize", "vocab"], "msg": err}] | ||||
|             raise ConfigValidationError(config=nlp.config, errors=errors) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user