mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 16:24:16 +03:00
Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare
This commit is contained in:
commit
8ce9f44433
|
@ -1,8 +1,9 @@
|
|||
[paths]
|
||||
train = ""
|
||||
dev = ""
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
vocab_data = null
|
||||
init_tok2vec = null
|
||||
|
||||
[system]
|
||||
seed = 0
|
||||
|
@ -96,19 +97,16 @@ eps = 1e-8
|
|||
learn_rate = 0.001
|
||||
|
||||
# The 'initialize' step is run before training or pretraining. Components and
|
||||
# the tokenizer can each define their own prepare step, giving them a chance
|
||||
# to gather resources like lookup-tables, build label sets, construct vocabularies,
|
||||
# etc. After 'prepare' is finished, the result will be saved out to disk, which
|
||||
# will then be read in at the start of training. You can call the prepare step
|
||||
# separately with the `spacy prepare` command, or you can let the train script
|
||||
# do it for you.
|
||||
# the tokenizer can each define their own arguments via their .initialize
|
||||
# methods that are populated by the config. This lets them gather resources like
|
||||
# lookup tables, build label sets, construct vocabularies, etc.
|
||||
[initialize]
|
||||
tokenizer = {}
|
||||
components = {}
|
||||
|
||||
[initialize.vocab]
|
||||
data = ${paths.vocab_data}
|
||||
vocab_data = ${paths.vocab_data}
|
||||
lookups = null
|
||||
vectors = null
|
||||
vectors = ${paths.vectors}
|
||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
# Arguments passed to the tokenizer's initialize method
|
||||
tokenizer = {}
|
||||
# Arguments passed to the initialize methods of the components (keyed by component name)
|
||||
components = {}
|
||||
|
|
|
@ -1188,14 +1188,13 @@ class Language:
|
|||
config = self.config.interpolate()
|
||||
# These are the settings provided in the [initialize] block in the config
|
||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||
V = I["vocab"]
|
||||
init_vocab(
|
||||
self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
|
||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
|
||||
)
|
||||
pretrain_cfg = config.get("pretraining")
|
||||
if pretrain_cfg:
|
||||
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
|
||||
init_tok2vec(self, P, V)
|
||||
init_tok2vec(self, P, I)
|
||||
if self.vocab.vectors.data.shape[1] >= 1:
|
||||
ops = get_current_ops()
|
||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||
|
|
|
@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel):
|
|||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ConfigSchemaInitVocab(BaseModel):
|
||||
class ConfigSchemaInit(BaseModel):
|
||||
# fmt: off
|
||||
data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
|
||||
vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
|
||||
lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
|
||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
||||
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel):
|
|||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ConfigSchemaInit(BaseModel):
|
||||
vocab: ConfigSchemaInitVocab
|
||||
tokenizer: Any
|
||||
components: Dict[StrictStr, Any]
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ConfigSchema(BaseModel):
|
||||
training: ConfigSchemaTraining
|
||||
nlp: ConfigSchemaNlp
|
||||
|
|
|
@ -121,15 +121,15 @@ def load_vectors_into_model(
|
|||
|
||||
|
||||
def init_tok2vec(
|
||||
nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
|
||||
nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
|
||||
) -> bool:
|
||||
# Load pretrained tok2vec weights - cf. CLI command 'pretrain'
|
||||
P = pretrain_config
|
||||
V = vocab_config
|
||||
I = init_config
|
||||
weights_data = None
|
||||
init_tok2vec = ensure_path(V["init_tok2vec"])
|
||||
init_tok2vec = ensure_path(I["init_tok2vec"])
|
||||
if init_tok2vec is not None:
|
||||
if P["objective"].get("type") == "vectors" and not V["vectors"]:
|
||||
if P["objective"].get("type") == "vectors" and not I["vectors"]:
|
||||
err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
|
||||
errors = [{"loc": ["initialize", "vocab"], "msg": err}]
|
||||
raise ConfigValidationError(config=nlp.config, errors=errors)
|
||||
|
|
Loading…
Reference in New Issue
Block a user