Mirror of https://github.com/explosion/spaCy.git
Tighten up format

commit fd594cfb9b
parent 978ab54a84
```diff
@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-init_tok2vec = null
+vectors = null
 vocab_data = null
+init_tok2vec = null
 
 [system]
 seed = 0
```
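For reference, a minimal sketch of how the reordered `[paths]` block parses, assuming thinc's `Config` class (the backend of spaCy's config system); the `cfg_str` snippet is illustrative, not from the repo:

```python
# Minimal sketch, assuming thinc's Config API (spaCy's config backend).
from thinc.api import Config

cfg_str = """
[paths]
train = ""
dev = ""
vectors = null
vocab_data = null
init_tok2vec = null
"""

config = Config().from_str(cfg_str)
assert config["paths"]["vectors"] is None  # config-level null parses to None
```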
```diff
@@ -96,19 +97,16 @@ eps = 1e-8
 learn_rate = 0.001
 
 # The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own prepare step, giving them a chance
-# to gather resources like lookup-tables, build label sets, construct vocabularies,
-# etc. After 'prepare' is finished, the result will be saved out to disk, which
-# will then be read in at the start of training. You can call the prepare step
-# separately with the `spacy prepare` command, or you can let the train script
-# do it for you.
+# the tokenizer can each define their own arguments via their .initialize
+# methods that are populated by the config. This lets them gather resources like
+# lookup tables and build label sets, construct vocabularies, etc.
 [initialize]
-tokenizer = {}
-components = {}
-
-[initialize.vocab]
-data = ${paths.vocab_data}
+vocab_data = ${paths.vocab_data}
 lookups = null
-vectors = null
+vectors = ${paths.vectors}
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments passed to the initialize methods of the components (keyed by component name)
+components = {}
```
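The `${paths.vocab_data}` and `${paths.vectors}` references rely on config interpolation. A sketch of how that resolves, again assuming thinc's `Config` API; the two-section config here is a made-up example:

```python
# Sketch of variable interpolation, assuming thinc's Config API.
from thinc.api import Config

cfg_str = """
[paths]
vocab_data = "vocab.jsonl"

[initialize]
vocab_data = ${paths.vocab_data}
"""

config = Config().from_str(cfg_str, interpolate=False)
resolved = config.interpolate()
assert resolved["initialize"]["vocab_data"] == "vocab.jsonl"
```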
```diff
@@ -1203,14 +1203,13 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-        V = I["vocab"]
         init_vocab(
-            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
         )
         pretrain_cfg = config.get("pretraining")
         if pretrain_cfg:
             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, V)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
```
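One note on the unchanged tail of this hunk: `get_current_ops()` comes from thinc, and `ops.asarray` is what moves the vectors table onto the active backend. A small standalone sketch of that behavior (the array shape is arbitrary):

```python
# Sketch of the device move at the end of the hunk, using thinc's
# get_current_ops: asarray keeps the data on CPU under NumpyOps and
# transfers it to GPU memory under CupyOps.
import numpy
from thinc.api import get_current_ops

ops = get_current_ops()
data = numpy.zeros((10, 300), dtype="f")
data = ops.asarray(data)
```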
```diff
@@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInitVocab(BaseModel):
+class ConfigSchemaInit(BaseModel):
     # fmt: off
-    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
+    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
```
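For anyone reading the schema change: the flattened model keeps every field required but nullable. (Note that `help=` is not a standard pydantic `Field` keyword the way `title=` is; pydantic v1 stores unknown keyword arguments as extra metadata.) A simplified pydantic sketch of what now validates — field names are copied from the diff, the model name and `Field` metadata are not:

```python
# Simplified sketch of the flattened schema; field names follow the
# diff, but Field(...) metadata is dropped here for brevity.
from typing import Any, Dict, Optional
from pydantic import BaseModel, StrictStr

class InitSketch(BaseModel):
    vocab_data: Optional[StrictStr]
    vectors: Optional[StrictStr]
    init_tok2vec: Optional[StrictStr]
    tokenizer: Dict[StrictStr, Any]
    components: Dict[StrictStr, Dict[StrictStr, Any]]

init = InitSketch(
    vocab_data=None,
    vectors=None,
    init_tok2vec=None,
    tokenizer={},
    components={"ner": {"labels": ["PERSON"]}},  # per-component init args
)
assert init.components["ner"]["labels"] == ["PERSON"]
```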
```diff
@@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInit(BaseModel):
-    vocab: ConfigSchemaInitVocab
-    tokenizer: Any
-    components: Dict[StrictStr, Any]
-
-    class Config:
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
```
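The net effect of dropping the nested `ConfigSchemaInitVocab` is that consumers index one level less into the resolved settings. A before/after sketch with plain dicts (keys per the diff, values illustrative):

```python
# Before/after sketch of the flattening, using plain dicts.
old_style = {"vocab": {"data": "vocab.jsonl", "lookups": None, "vectors": None}}
new_style = {"vocab_data": "vocab.jsonl", "lookups": None, "vectors": None}

assert old_style["vocab"]["data"] == new_style["vocab_data"]
```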
```diff
@@ -121,15 +121,15 @@ def load_vectors_into_model(
 
 
 def init_tok2vec(
-    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    V = vocab_config
+    I = init_config
     weights_data = None
-    init_tok2vec = ensure_path(V["init_tok2vec"])
+    init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
             err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
             errors = [{"loc": ["initialize", "vocab"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
```
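`ensure_path` (from `spacy.util`) is what makes the `is not None` guard work when `init_tok2vec` is unset: strings become `Path` objects and `None` passes through. (The unchanged error message and `loc` still point at `initialize.vocab.vectors`, which this commit flattens to `initialize.vectors`.) A quick sketch; the filename is a placeholder:

```python
# Sketch of ensure_path behavior (spacy.util): str -> Path, None -> None,
# which is why the `if init_tok2vec is not None` guard above works.
from spacy.util import ensure_path

assert ensure_path(None) is None
print(ensure_path("weights.bin"))  # PosixPath('weights.bin') on POSIX
```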