Fix typos and refactor CLI logging

Ines Montani 2020-09-28 21:17:10 +02:00
parent 2e9c9e74af
commit a139fe672b
7 changed files with 52 additions and 67 deletions

View File

@@ -448,19 +448,8 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
return result
class CliLogger:
"""Helper mocking up the most commonly used logger methods. Can be passed
into functions like train() to make them output pretty-printed messages
on the CLI and regular logging if used from within Python.
"""
debug = msg.text
info = msg.info
warn = msg.info
error = msg.fail
def setup_gpu(use_gpu: int):
def setup_gpu(use_gpu: int) -> None:
"""Configure the GPU and log info."""
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
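The removed CliLogger shim is superseded by wasabi's Printer, which the later hunks construct with no_print=silent so the same functions stay quiet when called from Python. A minimal sketch of the new pattern (illustrative only; do_work is a made-up example, not part of this commit):

from wasabi import Printer

def do_work(silent: bool = True) -> None:
    # Printer(no_print=True) swallows all output, so a plain `silent` flag
    # replaces passing a logger-like object such as CliLogger.
    msg = Printer(no_print=silent)
    msg.info("Doing work")   # pretty-printed only when silent=False
    msg.good("Done")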

View File

@@ -7,7 +7,7 @@ import typer
from .. import util
from ..training.initialize import init_nlp
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, CliLogger, setup_gpu
from ._util import import_code, setup_gpu
@init_cli.command(
@@ -32,6 +32,6 @@ def init_pipeline_cli(
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
with show_validation_error(hint_fill=False):
nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
nlp.to_disk(output_path)
msg.good(f"Saved initialized pipeline to {output_path}")
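With the logger argument gone, the CLI passes silent=False to get pretty-printed status messages, while programmatic callers keep the quiet default. A hedged usage sketch (the config path is a placeholder):

from spacy import util
from spacy.training.initialize import init_nlp

config = util.load_config("config.cfg")          # placeholder path
nlp = init_nlp(config)                           # silent=True by default: no console output
nlp = init_nlp(config, use_gpu=-1, silent=False) # prints the info/good messages shown below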

View File

@@ -5,7 +5,7 @@ import typer
import re
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu, CliLogger
from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
@@ -73,7 +73,7 @@ def pretrain_cli(
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
logger=CliLogger,
silent=False,
)
msg.good("Successfully finished pretrain")

View File

@@ -6,7 +6,7 @@ import typer
import logging
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, CliLogger, setup_gpu
from ._util import import_code, setup_gpu
from ..language import Language
from ..training.loop import train
from ..training.initialize import init_nlp, must_reinitialize
@@ -50,15 +50,13 @@ def train_cli(
msg.divider("Initializing pipeline")
nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
msg.divider("Training pipeline")
final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
if final_path:
msg.good(f"Saved pipeline to output directory", final_path)
train(nlp, output_path, use_gpu=use_gpu, silent=False)
def init_pipeline(
config: Config, output_path: Optional[Path], *, use_gpu: int = -1
) -> Language:
init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
init_kwargs = {"use_gpu": use_gpu, "silent": False}
if output_path is not None:
init_path = output_path / "model-initial"
if not init_path.exists():

View File

@@ -1,7 +1,8 @@
from typing import Union, Dict, Optional, Any, List, Callable
from typing import Union, Dict, Optional, Any, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator
from thinc.api import ConfigValidationError
from pathlib import Path
from wasabi import Printer
import srsly
from .loop import create_before_to_disk_callback
@@ -10,16 +11,11 @@ from ..lookups import Lookups
from ..errors import Errors
from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, resolve_dot_names
from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
def init_nlp(
config: Config,
*,
use_gpu: int = -1,
logger: Callable[[Any], Any] = logger,
on_success: Callable[[str], None] = lambda x: None,
) -> Language:
def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
msg = Printer(no_print=silent)
raw_config = config
config = raw_config.interpolate()
if config["training"]["seed"] is not None:
@@ -30,7 +26,7 @@ def init_nlp(
# Use original config here before it's resolved to functions
sourced_components = get_sourced_components(config)
nlp = load_model_from_config(raw_config, auto_fill=True)
on_success("Set up nlp object from config")
msg.good("Set up nlp object from config")
config = nlp.config.interpolate()
# Resolve all training-relevant sections using the filled nlp config
T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -38,29 +34,31 @@ def init_nlp(
train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
V = I["vocab"]
init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
init_vocab(
nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
)
optimizer = T["optimizer"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced_components if p not in frozen_components]
logger.info(f"Pipeline: {nlp.pipe_names}")
msg.info(f"Pipeline: {nlp.pipe_names}")
if resume_components:
with nlp.select_pipes(enable=resume_components):
logger.info(f"Resuming training for: {resume_components}")
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
on_success(f"Initialized pipeline components")
msg.good(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:
P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
loaded = add_tok2vec_weights(nlp, P, I)
loaded = add_tok2vec_weights(nlp, P, V)
if loaded and P["component"]:
on_success(f"Loaded pretrained weights into component '{P['component']}'")
msg.good(f"Loaded pretrained weights into component '{P['component']}'")
nlp = before_to_disk(nlp)
return nlp
@@ -76,11 +74,12 @@ def init_vocab(
data: Optional[Path] = None,
lookups: Optional[Lookups] = None,
vectors: Optional[str] = None,
on_success: Callable[[str], None] = lambda x: None,
silent: bool = True,
) -> Language:
msg = Printer(no_print=silent)
if lookups:
nlp.vocab.lookups = lookups
on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
data_path = ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
@@ -96,11 +95,11 @@ def init_vocab(
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
on_success("Created vocabulary")
msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
msg.good("Created vocabulary")
if vectors is not None:
load_vectors_into_model(nlp, vectors)
on_success(f"Added vectors: {vectors}")
msg.good(f"Added vectors: {vectors}")
def load_vectors_into_model(
@@ -137,8 +136,8 @@ def add_tok2vec_weights(
init_tok2vec = ensure_path(V["init_tok2vec"])
if init_tok2vec is not None:
if P["objective"].get("type") == "vectors" and not V["vectors"]:
err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
errors = [{"loc": ["initialize", "vectors"], "msg": err}]
err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
errors = [{"loc": ["initialize", "vocab"], "msg": err}]
raise ConfigValidationError(config=nlp.config, errors=errors)
if not init_tok2vec.exists():
err = f"can't find pretrained tok2vec: {init_tok2vec}"
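init_vocab follows the same pattern: instead of an on_success callback it builds its own Printer from the silent flag. A small usage sketch (values here are illustrative, not part of this commit):

import spacy
from spacy.training.initialize import init_vocab

nlp = spacy.blank("en")
# With silent=False the msg.good status lines above are printed to the console.
init_vocab(nlp, data=None, lookups=None, vectors=None, silent=False)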

View File

@@ -5,12 +5,13 @@ from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
import random
import tqdm
from wasabi import Printer
from .example import Example
from ..schemas import ConfigSchemaTraining
from ..language import Language
from ..errors import Errors
from ..util import resolve_dot_names, registry, logger
from ..util import resolve_dot_names, registry
def train(
@@ -18,8 +19,8 @@ def train(
output_path: Optional[Path] = None,
*,
use_gpu: int = -1,
logger: Callable[[Any], Any] = logger,
) -> Optional[Path]:
silent: bool = False,
) -> None:
"""Train a pipeline.
nlp (Language): The initialized nlp object with the full config.
@@ -31,7 +32,7 @@ def train(
swapped for CLI logger.
RETURNS (Path / None): The path to the final exported model.
"""
msg = Printer(no_print=silent)
# Create iterator, which yields out info after each optimization step.
config = nlp.config.interpolate()
if config["training"]["seed"] is not None:
@@ -62,10 +63,10 @@ def train(
eval_frequency=T["eval_frequency"],
exclude=frozen_components,
)
logger.info(f"Pipeline: {nlp.pipe_names}")
msg.info(f"Pipeline: {nlp.pipe_names}")
if frozen_components:
logger.info(f"Frozen components: {frozen_components}")
logger.info(f"Initial learn rate: {optimizer.learn_rate}")
msg.info(f"Frozen components: {frozen_components}")
msg.info(f"Initial learn rate: {optimizer.learn_rate}")
with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
@@ -89,7 +90,7 @@ def train(
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
# specific error.
logger.warn(
msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
)
@@ -105,7 +106,7 @@ def train(
nlp.to_disk(final_model_path)
else:
nlp.to_disk(final_model_path)
return final_model_path
msg.good(f"Saved pipeline to output directory", final_model_path)
def train_while_improving(
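train() now writes the final model and reports the saved path itself, returning None instead of the path. An illustrative end-to-end call (config and output paths are placeholders):

from pathlib import Path
from spacy import util
from spacy.training.initialize import init_nlp
from spacy.training.loop import train

config = util.load_config("config.cfg")                 # placeholder path
nlp = init_nlp(config, use_gpu=-1, silent=True)         # initialize quietly
train(nlp, Path("./output"), use_gpu=-1, silent=False)  # prints progress and the final save message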

View File

@@ -1,4 +1,4 @@
from typing import Optional, Callable, Any, Iterable, Union, List
from typing import Optional, Callable, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from pathlib import Path
@@ -8,7 +8,7 @@ import srsly
import numpy
import time
import re
from wasabi import msg
from wasabi import Printer
from .example import Example
from ..tokens import Doc
@@ -16,7 +16,7 @@ from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..util import registry, load_model_from_config, dot_to_object, logger
from ..util import registry, load_model_from_config, dot_to_object
def pretrain(
@@ -25,8 +25,9 @@ def pretrain(
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
logger: Callable[[Any], Any] = logger,
silent: bool = True,
):
msg = Printer(no_print=silent)
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
@@ -42,11 +43,10 @@ def pretrain(
optimizer = P["optimizer"]
# Load in pretrained weights to resume from
if resume_path is not None:
_resume_model(model, resume_path, epoch_resume)
_resume_model(model, resume_path, epoch_resume, silent=silent)
else:
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
# TODO: move this to logger function?
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@@ -94,12 +94,10 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
def _resume_model(
model: Model,
resume_path: Path,
epoch_resume: int,
logger: Callable[[Any], Any] = logger,
model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
) -> None:
logger.info(f"Resume training tok2vec from: {resume_path}")
msg = Printer(no_print=silent)
msg.info(f"Resume training tok2vec from: {resume_path}")
with resume_path.open("rb") as file_:
weights_data = file_.read()
model.get_ref("tok2vec").from_bytes(weights_data)
@@ -108,9 +106,9 @@ def _resume_model(
if model_name:
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
logger.info(f"Resuming from epoch: {epoch_resume}")
msg.info(f"Resuming from epoch: {epoch_resume}")
else:
logger.info(f"Resuming from epoch: {epoch_resume}")
msg.info(f"Resuming from epoch: {epoch_resume}")
def make_update(
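For clarity, the epoch parsing in _resume_model above relies on the default weights file name written during pretraining (e.g. model14.bin): slicing off the "model" prefix and the ".bin" suffix leaves the epoch number, and training resumes at the following epoch. A worked example (filename is illustrative):

name = "model14.bin"                   # default weights file written during pretraining
epoch_resume = int(name[5:][:-4]) + 1  # "14" -> 14, so resume at epoch 15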