Update config

Ines Montani 2020-09-28 12:05:23 +02:00
parent 9f6ad06452
commit 1590de11b1
5 changed files with 26 additions and 37 deletions

View File

@@ -72,14 +72,15 @@ def init_pipeline(config: Config, use_gpu: int = -1) -> Language:
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
-    dot_names = [T["train_corpus"], T["dev_corpus"], T["raw_text"]]
-    train_corpus, dev_corpus, raw_text = resolve_dot_names(config, dot_names)
+    dot_names = [T["train_corpus"], T["dev_corpus"]]
+    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
-    init_vocab(nlp, data=I["vocab"]["data"], lookups=I["vocab"]["lookups"])
+    V = I["vocab"]
+    init_vocab(nlp, data=V["data"], lookups=V["lookups"])
     msg.good("Created vocabulary")
-    if T["vectors"] is not None:
-        add_vectors(nlp, T["vectors"])
-        msg.good(f"Added vectors: {T['vectors']}")
+    if V["vectors"] is not None:
+        add_vectors(nlp, V["vectors"])
+        msg.good(f"Added vectors: {V['vectors']}")
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
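
Note: the dropped T["raw_text"] entry means the training loop no longer resolves a raw-text corpus here. For intuition, resolve_dot_names turns dotted references like "corpora.train" into the objects registered at those config paths. A rough, simplified stand-in (not spaCy's actual implementation):

# Hypothetical sketch: walk a nested config dict along a dotted path.
def resolve_dot_name_sketch(config: dict, dot_name: str):
    obj = config
    for key in dot_name.split("."):
        obj = obj[key]
    return obj

config = {"corpora": {"train": "<train reader>", "dev": "<dev reader>"}}
train_corpus = resolve_dot_name_sketch(config, "corpora.train")  # "<train reader>"
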
@@ -130,20 +131,15 @@ def init_vocab(
 
 
 def add_tok2vec_weights(
-    nlp: Language, pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
+    nlp: Language, pretrain_config: Dict[str, Any], vocab_config: Dict[str, Any]
 ) -> None:
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     P = pretrain_config
-    I = init_config
-    raw_text = util.ensure_path(I["vocab"]["raw_text"])
-    if raw_text is not None:
-        if not raw_text.exists():
-            msg.fail("Can't find raw text", raw_text, exits=1)
-        raw_text = list(srsly.read_jsonl(raw_text))
+    V = vocab_config
     weights_data = None
-    init_tok2vec = util.ensure_path(I["vocab"]["init_tok2vec"])
+    init_tok2vec = util.ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
+        if P["objective"].get("type") == "vectors" and not V["vectors"]:
             err = "Need initialize.vectors if pretraining.objective.type is vectors"
             msg.fail(err, exits=1)
         if not init_tok2vec.exists():
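
Note: after this refactor, all vocabulary resources (data, lookups, vectors, pretrained tok2vec weights) are read from the resolved [initialize.vocab] block instead of [training]. A minimal sketch of the new access pattern, with plain dicts and made-up values standing in for the resolved sections:

# Made-up stand-ins for the resolved [training] and [initialize] sections.
T = {"train_corpus": "corpora.train", "dev_corpus": "corpora.dev"}
I = {"vocab": {"data": None, "lookups": None,
               "vectors": "my_vectors_model", "init_tok2vec": None}}

V = I["vocab"]  # everything vocab-related now hangs off initialize.vocab
if V["vectors"] is not None:
    print(f"Added vectors: {V['vectors']}")  # stands in for add_vectors(nlp, ...)
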

View File

@@ -277,11 +277,6 @@ path = ${paths.dev}
 max_length = 0
 
 [training]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
 {% endif -%}
@@ -317,3 +312,12 @@ start = 100
 stop = 1000
 compound = 1.001
 {% endif %}
+
+[initialize]
+
+[initialize.vocab]
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
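
Note: one way to sanity-check the moved template block is to render it directly. A quick sketch (assuming jinja2 is installed; the variable values are made up):

from jinja2 import Template

TEMPLATE = """\
[initialize]

[initialize.vocab]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
"""

print(Template(TEMPLATE).render(
    use_transformer=False, optimize="accuracy", word_vectors="en_core_web_lg"
))
# vectors = "en_core_web_lg" now lands under [initialize.vocab], not [training]
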

View File

@@ -1,8 +1,9 @@
 [paths]
 train = ""
 dev = ""
-raw = null
+raw_text = null
 init_tok2vec = null
+vocab_data = null
 
 [system]
 seed = 0
@@ -54,11 +55,6 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Extra resources for transfer-learning or pseudo-rehearsal
-init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
-vectors = null
-lookups = null
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
 max_epochs = 0
@@ -112,9 +108,8 @@ tokenizer = {}
 components = {}
 
 [initialize.vocab]
-data = null
+data = ${paths.vocab_data}
 lookups = null
 vectors = null
 # Extra resources for transfer-learning or pseudo-rehearsal
 init_tok2vec = ${paths.init_tok2vec}
-raw_text = ${paths.raw}
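
Note: the new data = ${paths.vocab_data} line relies on config variable interpolation. A small sketch of how the reference resolves (using thinc.api.Config, which spaCy's config system builds on; the path value is a placeholder):

from thinc.api import Config

CFG = """
[paths]
raw_text = null
vocab_data = "my_vocab_data.jsonl"

[initialize]

[initialize.vocab]
data = ${paths.vocab_data}
"""

config = Config().from_str(CFG, interpolate=False)
resolved = config.interpolate()
print(resolved["initialize"]["vocab"]["data"])  # -> my_vocab_data.jsonl
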

View File

@@ -32,7 +32,7 @@ learn_rate = 0.001
 
 [corpora.pretrain]
 @readers = "spacy.JsonlReader.v1"
-path = ${paths.raw}
+path = ${paths.raw_text}
 min_length = 5
 max_length = 500
 limit = 0
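
Note: paths.raw becomes paths.raw_text, matching the renamed path in the default config above. The spacy.JsonlReader.v1 it feeds expects newline-delimited JSON, typically with a "text" field per record. A minimal sketch of producing such a file (srsly is a spaCy dependency; the file name is arbitrary):

import srsly

raw_text = [
    {"text": "Raw, unannotated text for the pretraining objective."},
    {"text": "One JSON object per line, each with a 'text' field."},
]
srsly.write_jsonl("raw_text.jsonl", raw_text)  # point paths.raw_text at this file
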

View File

@@ -3,7 +3,6 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
-from pydantic import root_validator
 from thinc.config import Promise
 from collections import defaultdict
 from thinc.api import Optimizer
@@ -205,8 +204,6 @@ class ModelMetaSchema(BaseModel):
 
 class ConfigSchemaTraining(BaseModel):
     # fmt: off
-    vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
-    lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
     dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
     train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
     batcher: Batcher = Field(..., title="Batcher for the training data")
@@ -219,8 +216,6 @@ class ConfigSchemaTraining(BaseModel):
     gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
     accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
     score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
-    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     optimizer: Optimizer = Field(..., title="The optimizer to use")
     logger: Logger = Field(..., title="The logger to track training progress")
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
@@ -275,11 +270,10 @@ class ConfigSchemaPretrain(BaseModel):
 
 class ConfigSchemaInitVocab(BaseModel):
     # fmt: off
-    data: Optional[str] = Field(..., title="Path to JSON-formatted vocabulary file")
+    data: Optional[StrictStr] = Field(..., title="Path to JSON-formatted vocabulary file")
     lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization")
     vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
-    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
     # fmt: on
 
     class Config:
@@ -290,7 +284,7 @@ class ConfigSchemaInitVocab(BaseModel):
 class ConfigSchemaInit(BaseModel):
     vocab: ConfigSchemaInitVocab
     tokenizer: Any
-    components: Dict[str, Any]
+    components: Dict[StrictStr, Any]
 
     class Config:
         extra = "forbid"
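
Note: the str -> StrictStr tightenings (in ConfigSchemaInitVocab.data and ConfigSchemaInit.components) matter because pydantic v1 (the API style imported here) silently coerces values for plain str fields, while strict types reject non-strings. A minimal illustration with a hypothetical model:

from typing import Optional
from pydantic import BaseModel, StrictStr, ValidationError

class VocabSketch(BaseModel):
    data: Optional[str] = None               # plain str: coerces silently
    strict_data: Optional[StrictStr] = None  # StrictStr: rejects non-strings

print(VocabSketch(data=123).data)  # -> "123" (int coerced to str)
try:
    VocabSketch(strict_data=123)
except ValidationError:
    print("StrictStr rejected the non-string value")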