Fix handling of optional [pretraining] block (#5954)

* Fix handling of optional [pretraining] block

* Remove pretraining from default config

* Fix test

* Add schema option for empty pretrain block
Ines Montani 2020-08-24 15:56:03 +02:00 committed by GitHub
parent 463f1c8623
commit 0e7f99da58
9 changed files with 84 additions and 45 deletions
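The diff below adds a --pretraining flag to the 'init fill-config' command and a matching keyword argument on the underlying helper (roughly: python -m spacy init fill-config base_config.cfg config_pretrain.cfg --pretraining). A minimal sketch of calling that helper directly; the spacy.cli.init_config module path and the file names are assumptions, not shown in this diff:

    # Hypothetical direct call to the fill_config() helper changed below.
    from pathlib import Path
    from spacy.cli.init_config import fill_config  # module path assumed

    # Auto-fill base_config.cfg and append the default [pretraining] block,
    # writing the result to config_pretrain.cfg.
    fill_config(Path("config_pretrain.cfg"), Path("base_config.cfg"), pretraining=True)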


@@ -51,7 +51,7 @@ def debug_model_cli(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=config_overrides)
         nlp, config = util.load_model_from_config(config_path)
-    seed = config["pretraining"]["seed"]
+    seed = config["training"]["seed"]
     if seed is not None:
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)


@@ -7,6 +7,7 @@ import srsly
 import re
 from .. import util
+from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
@@ -48,6 +49,7 @@ def init_fill_config_cli(
     # fmt: off
     base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
+    pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
     diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
     # fmt: on
 ):
@@ -58,19 +60,24 @@ def init_fill_config_cli(
     can be used with a config generated via the training quickstart widget:
     https://nightly.spacy.io/usage/training#quickstart
     """
-    fill_config(output_file, base_path, diff=diff)
+    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)


 def fill_config(
-    output_file: Path, base_path: Path, *, diff: bool = False
+    output_file: Path, base_path: Path, *, pretraining: bool = False, diff: bool = False
 ) -> Tuple[Config, Config]:
     is_stdout = str(output_file) == "-"
     msg = Printer(no_print=is_stdout)
     with show_validation_error(hint_fill=False):
         config = util.load_config(base_path)
         nlp, _ = util.load_model_from_config(config, auto_fill=True)
+    filled = nlp.config
+    if pretraining:
+        validate_config_for_pretrain(filled, msg)
+        pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+        filled = pretrain_config.merge(filled)
     before = config.to_str()
-    after = nlp.config.to_str()
+    after = filled.to_str()
     if before == after:
         msg.warn("Nothing to auto-fill: base config is already complete")
     else:
@@ -84,7 +91,7 @@ def fill_config(
         print(diff_strings(before, after))
         msg.divider("END CONFIG DIFF")
         print("")
-    save_config(nlp.config, output_file, is_stdout=is_stdout)
+    save_config(filled, output_file, is_stdout=is_stdout)


 def init_config(
@@ -132,12 +139,9 @@ def init_config(
         msg.info("Generated template specific for your use case")
         for label, value in use_case.items():
             msg.text(f"- {label}: {value}")
-    use_transformer = bool(template_vars.use_transformer)
     with show_validation_error(hint_fill=False):
         config = util.load_config_from_str(base_template)
         nlp, _ = util.load_model_from_config(config, auto_fill=True)
-    if use_transformer:
-        nlp.config.pop("pretraining", {})  # TODO: solve this better
     msg.good("Auto-filled config with all values")
     save_config(nlp.config, output_file, is_stdout=is_stdout)
@@ -161,3 +165,15 @@ def has_spacy_transformers() -> bool:
         return True
     except ImportError:
         return False
+
+
+def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
+    if "tok2vec" not in config["nlp"]["pipeline"]:
+        msg.warn(
+            "No tok2vec component found in the pipeline. If your tok2vec "
+            "component has a different name, you may need to adjust the "
+            "tok2vec_model reference in the [pretraining] block. If you don't "
+            "have a tok2vec component, make sure to add it to your [components] "
+            "and the pipeline specified in the [nlp] block, so you can pretrain "
+            "weights for it."
+        )
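For reference, a minimal sketch of what the pretrain_config.merge(filled) call above does, using thinc's Config directly with made-up values: merge does a deep merge in which the argument (here the auto-filled config) takes precedence over the defaults it is merged into.

    from thinc.config import Config

    # Stand-in for the defaults loaded from DEFAULT_CONFIG_PRETRAIN_PATH,
    # trimmed down to two settings for the sketch.
    defaults = Config().from_str("[pretraining]\nmax_epochs = 1000\ndropout = 0.2\n")
    # Stand-in for the auto-filled nlp.config, overriding one of them.
    filled = Config().from_str("[pretraining]\ndropout = 0.5\n")
    merged = defaults.merge(filled)
    assert merged["pretraining"]["dropout"] == 0.5      # value from `filled` wins
    assert merged["pretraining"]["max_epochs"] == 1000  # default is kept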


@@ -90,19 +90,20 @@
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=config_overrides)
         nlp, config = util.load_model_from_config(config)
-    # TODO: validate that [pretraining] block exists
+    pretrain_config = config["pretraining"]
+    if not pretrain_config:
+        # TODO: What's the solution here? How do we handle optional blocks?
+        msg.fail("The [pretraining] block in your config is empty", exits=1)
     if not output_dir.exists():
         output_dir.mkdir()
         msg.good(f"Created output directory: {output_dir}")
-    seed = config["pretraining"]["seed"]
+    seed = pretrain_config["seed"]
     if seed is not None:
         fix_random_seed(seed)
-    if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
+    if use_gpu >= 0 and pretrain_config["use_pytorch_for_gpu_memory"]:
         use_pytorch_for_gpu_memory()
     config.to_disk(output_dir / "config.cfg")
     msg.good("Saved config file in the output directory")
-    pretrain_config = config["pretraining"]
     if texts_loc != "-":  # reading from a file
         with msg.loading("Loading input texts..."):
             texts = list(srsly.read_jsonl(texts_loc))
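With the schema change further down, a [pretraining] block that is missing or left empty resolves to an empty dict rather than None, which is what makes the truthiness check above safe. A small illustration using thinc's Config (the same class the tests below use), assuming an empty section parses to {}:

    from thinc.config import Config

    config = Config().from_str("[pretraining]\n")  # block present but empty
    assert config["pretraining"] == {}
    if not config["pretraining"]:
        # mirrors the msg.fail(...) branch in the hunk above
        print("The [pretraining] block in your config is empty")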


@@ -117,7 +117,7 @@ def train(
     # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
+        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
         if tok2vec_path is None:
             msg.fail(
                 f"To use a pretrained tok2vec model, the config needs to specify which "


@@ -90,29 +90,3 @@ eps = 1e-8
 warmup_steps = 250
 total_steps = 20000
 initial_rate = 0.001
-
-[pretraining]
-max_epochs = 1000
-min_length = 5
-max_length = 500
-dropout = 0.2
-n_save_every = null
-batch_size = 3000
-seed = ${system.seed}
-use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
-tok2vec_model = "components.tok2vec.model"
-
-[pretraining.objective]
-type = "characters"
-n_characters = 4
-
-[pretraining.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = true
-eps = 1e-8
-learn_rate = 0.001


@@ -0,0 +1,25 @@
+[pretraining]
+max_epochs = 1000
+min_length = 5
+max_length = 500
+dropout = 0.2
+n_save_every = null
+batch_size = 3000
+seed = ${system.seed}
+use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
+tok2vec_model = "components.tok2vec.model"
+
+[pretraining.objective]
+type = "characters"
+n_characters = 4
+
+[pretraining.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
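The new pretraining defaults file above is deliberately kept out of the main defaults; it is only loaded on demand via the DEFAULT_CONFIG_PRETRAIN_PATH constant added to language.py below. A quick sketch of loading it on its own, assuming util.load_config does not interpolate by default (which the fill_config code above relies on, since ${system.seed} can only resolve once the block is merged into a complete config):

    from spacy import util
    from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH

    # Variables like ${system.seed} remain unresolved at this point.
    pretrain_defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    assert "pretraining" in pretrain_defaults
    assert "objective" in pretrain_defaults["pretraining"]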


@@ -37,6 +37,9 @@ from . import about
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
+# This is the base config for the [pretraining] block and currently not included
+# in the main config and only added via the 'init fill-config' command
+DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"


 class BaseDefaults:


@@ -233,6 +233,11 @@ class ConfigSchemaNlp(BaseModel):
         arbitrary_types_allowed = True


+class ConfigSchemaPretrainEmpty(BaseModel):
+    class Config:
+        extra = "forbid"
+
+
 class ConfigSchemaPretrain(BaseModel):
     # fmt: off
     max_epochs: StrictInt = Field(..., title="Maximum number of epochs to train for")
@@ -257,16 +262,17 @@ class ConfigSchemaPretrain(BaseModel):
 class ConfigSchema(BaseModel):
     training: ConfigSchemaTraining
     nlp: ConfigSchemaNlp
-    pretraining: Optional[ConfigSchemaPretrain]
+    pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
     components: Dict[str, Dict[str, Any]]

     @root_validator
     def validate_config(cls, values):
         """Perform additional validation for settings with dependencies."""
         pt = values.get("pretraining")
-        if pt and pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
-            err = "Need nlp.vectors if pretraining.objective.type is vectors"
-            raise ValueError(err)
+        if pt and not isinstance(pt, ConfigSchemaPretrainEmpty):
+            if pt.objective.get("type") == "vectors" and not values["nlp"].vectors:
+                err = "Need nlp.vectors if pretraining.objective.type is vectors"
+                raise ValueError(err)
         return values

     class Config:
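A standalone sketch of the Union-with-empty-schema pattern above, with a single made-up field instead of the real ConfigSchemaPretrain; under pydantic v1 (the version spaCy used here), union members are tried left to right:

    from typing import Union
    from pydantic import BaseModel, StrictInt, ValidationError

    class PretrainEmpty(BaseModel):
        class Config:
            extra = "forbid"  # only an empty mapping validates against this

    class Pretrain(BaseModel):
        max_epochs: StrictInt

    class Root(BaseModel):
        pretraining: Union[Pretrain, PretrainEmpty] = {}

    Root(pretraining={"max_epochs": 1000})  # full block -> validates as Pretrain
    Root(pretraining={})                    # empty block -> validates as PretrainEmpty
    Root()                                  # missing block -> default stays {}
    try:
        Root(pretraining={"max_epochs": "lots"})  # rejected by both union members
    except ValidationError as err:
        print(err)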


@@ -3,10 +3,11 @@ from thinc.config import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.language import Language
+from spacy.language import Language, DEFAULT_CONFIG
 from spacy.util import registry, load_model_from_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
+from spacy.schemas import ConfigSchema

 from ..util import make_tempdir
@@ -299,3 +300,16 @@ def test_config_interpolation():
     nlp2 = English.from_config(interpolated)
     assert nlp2.config["training"]["train_corpus"]["path"] == ""
     assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
+
+
+def test_config_optional_sections():
+    config = Config().from_str(nlp_config_string)
+    config = DEFAULT_CONFIG.merge(config)
+    assert "pretraining" not in config
+    filled = registry.fill_config(config, schema=ConfigSchema, validate=False)
+    # Make sure that optional "pretraining" block doesn't default to None,
+    # which would (rightly) cause error because it'd result in a top-level
+    # key that's not a section (dict). Note that the following roundtrip is
+    # also how Config.interpolate works under the hood.
+    new_config = Config().from_str(filled.to_str())
+    assert new_config["pretraining"] == {}
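For context on why the block must default to {} and not None: the config text format can only express top-level keys as sections, so a None value would break the serialize-and-reparse roundtrip that the test (and Config.interpolate) performs. A minimal sketch of that, catching the error broadly since the exact exception depends on where thinc rejects the non-section value:

    from thinc.config import Config

    ok = Config({"pretraining": {}})
    assert Config().from_str(ok.to_str())["pretraining"] == {}  # round-trips fine

    broken = Config({"pretraining": None})
    try:
        Config().from_str(broken.to_str())
    except Exception as err:  # expected to fail on serialize or on re-parse
        print(f"as expected: {err}")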