Merge branch 'feature/prepare' of https://github.com/explosion/spaCy into feature/prepare

Matthew Honnibal 2020-09-29 16:57:38 +02:00
commit 8ce9f44433
4 changed files with 21 additions and 32 deletions


@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-init_tok2vec = null
 vectors = null
+vocab_data = null
+init_tok2vec = null
 
 [system]
 seed = 0
@@ -96,19 +97,16 @@ eps = 1e-8
 learn_rate = 0.001
 
 # The 'initialize' step is run before training or pretraining. Components and
-# the tokenizer can each define their own prepare step, giving them a chance
-# to gather resources like lookup-tables, build label sets, construct vocabularies,
-# etc. After 'prepare' is finished, the result will be saved out to disk, which
-# will then be read in at the start of training. You can call the prepare step
-# separately with the `spacy prepare` command, or you can let the train script
-# do it for you.
+# the tokenizer can each define their own arguments via their .initialize
+# methods, with values populated from the config. This lets them gather
+# resources like lookup tables, build label sets, construct vocabularies, etc.
 [initialize]
-tokenizer = {}
-components = {}
-
-[initialize.vocab]
-data = ${paths.vocab_data}
+vocab_data = ${paths.vocab_data}
 lookups = null
-vectors = null
+vectors = ${paths.vectors}
+# Extra resources for transfer-learning or pseudo-rehearsal
+init_tok2vec = ${paths.init_tok2vec}
+# Arguments passed to the tokenizer's initialize method
+tokenizer = {}
+# Arguments passed to the initialize methods of the components (keyed by component name)
+components = {}
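
Note: everything that previously lived under [initialize.vocab] is now flat on [initialize], with paths interpolated in from [paths]. As a minimal sketch of how such a block parses and interpolates, using thinc's Config loader (the config string is a trimmed illustration, not the full default config):

from thinc.api import Config

cfg_str = """
[paths]
vectors = null
vocab_data = null
init_tok2vec = null

[initialize]
vocab_data = ${paths.vocab_data}
lookups = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
tokenizer = {}
components = {}
"""

# Variables like ${paths.vectors} are substituted during interpolation, so the
# resolved [initialize] section is one flat mapping of plain values.
config = Config().from_str(cfg_str, interpolate=True)
init = config["initialize"]
assert init["vocab_data"] is None   # ${paths.vocab_data} -> null -> None
assert init["tokenizer"] == {}      # per-tokenizer arguments, empty by default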


@@ -1188,14 +1188,13 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-        V = I["vocab"]
         init_vocab(
-            self, data=V["data"], lookups=V["lookups"], vectors=V["vectors"],
+            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"],
         )
         pretrain_cfg = config.get("pretraining")
         if pretrain_cfg:
             P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, V)
+            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
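
Note: with the flattened block, registry.resolve validates [initialize] against ConfigSchemaInit and hands back a single flat dict, so the same object serves both init_vocab and init_tok2vec, and the intermediate V = I["vocab"] unpacking disappears. A toy sketch of the before/after access pattern with plain dicts (illustrative stand-ins, not spaCy internals):

# Before: a nested "vocab" sub-dict had to be unpacked and passed separately.
old_I = {"vocab": {"data": None, "lookups": None, "vectors": None},
         "tokenizer": {}, "components": {}}
V = old_I["vocab"]
old_args = {"data": V["data"], "lookups": V["lookups"], "vectors": V["vectors"]}

# After: one flat dict carries every setting, so a single object can be passed
# through to both init_vocab and init_tok2vec.
new_I = {"vocab_data": None, "lookups": None, "vectors": None,
         "init_tok2vec": None, "tokenizer": {}, "components": {}}
new_args = {"data": new_I["vocab_data"], "lookups": new_I["lookups"],
            "vectors": new_I["vectors"]}

assert old_args == new_args  # same settings, one fewer level of nesting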


@@ -357,12 +357,14 @@ class ConfigSchemaPretrain(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInitVocab(BaseModel):
+class ConfigSchemaInit(BaseModel):
     # fmt: off
-    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
+    vocab_data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
+    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
+    tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize")
+    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for Pipe.initialize methods of pipeline components, keyed by component")
     # fmt: on
 
     class Config:
@@ -370,16 +372,6 @@ class ConfigSchemaInitVocab(BaseModel):
         arbitrary_types_allowed = True
 
 
-class ConfigSchemaInit(BaseModel):
-    vocab: ConfigSchemaInitVocab
-    tokenizer: Any
-    components: Dict[StrictStr, Any]
-
-    class Config:
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
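
Note: because the merged schema keeps extra = "forbid", a config that still uses the old nested [initialize.vocab] layout now fails validation loudly instead of being silently ignored. A small self-contained sketch of that behavior in pydantic v1 style (InitSchema is a trimmed, hypothetical stand-in for ConfigSchemaInit):

from typing import Any, Dict, Optional
from pydantic import BaseModel, Field, StrictStr, ValidationError

class InitSchema(BaseModel):
    vocab_data: Optional[StrictStr] = Field(..., title="Path to vocabulary data")
    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
    tokenizer: Dict[StrictStr, Any] = Field(..., title="Tokenizer.initialize arguments")

    class Config:
        extra = "forbid"
        arbitrary_types_allowed = True

InitSchema(vocab_data=None, vectors=None, tokenizer={})  # new flat layout: valid
try:
    # Old nested layout: "vocab" is now an unknown key and is rejected.
    InitSchema(vocab={"data": None}, vocab_data=None, vectors=None, tokenizer={})
except ValidationError as err:
    print(err)  # vocab: extra fields not permitted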


@@ -121,15 +121,15 @@ def load_vectors_into_model(
 
 
 def init_tok2vec(
-    nlp: "Language", pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
+    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
 ) -> bool:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    V = vocab_config
+    I = init_config
     weights_data = None
-    init_tok2vec = ensure_path(V["init_tok2vec"])
+    init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
+        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
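
Note: the guard above only fires when pretrained tok2vec weights are supplied and the pretraining objective reconstructs vectors, in which case initialize.vectors must also be set. A minimal sketch of that logic in isolation (plain Python; ensure_path and ConfigValidationError are replaced with hypothetical stand-ins):

from pathlib import Path
from typing import Any, Dict, Optional

def check_init_tok2vec(P: Dict[str, Any], I: Dict[str, Any]) -> Optional[Path]:
    # Stand-in for ensure_path: coerce a string to a Path, pass None through.
    path = Path(I["init_tok2vec"]) if I["init_tok2vec"] is not None else None
    if path is not None:
        # A "vectors" objective predicts the static word vectors, so those
        # vectors must be available when the pretrained weights are loaded.
        if P["objective"].get("type") == "vectors" and not I["vectors"]:
            raise ValueError(
                'need initialize.vectors if pretraining.objective.type is "vectors"'
            )
    return path

check_init_tok2vec({"objective": {"type": "vectors"}},
                   {"init_tok2vec": "weights.bin", "vectors": "en_core_web_lg"})
# Raises ValueError: weights supplied but no vectors configured
# check_init_tok2vec({"objective": {"type": "vectors"}},
#                    {"init_tok2vec": "weights.bin", "vectors": None})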