Fix typos and refactor CLI logging

Ines Montani 2020-09-28 21:17:10 +02:00
parent 2e9c9e74af
commit a139fe672b
7 changed files with 52 additions and 67 deletions
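
The change replaces the CliLogger shim and the logger= / on_success= callback parameters with a plain silent flag: each training function now creates its own wasabi Printer(no_print=silent), so the CLI commands pass silent=False to get pretty-printed console output while programmatic use stays quiet by default. A minimal sketch of the pattern, not taken from the commit itself (initialize_something is a made-up name for illustration):

from wasabi import Printer


def initialize_something(*, silent: bool = True) -> None:
    # With silent=True (the library default) the Printer suppresses all
    # output; CLI commands pass silent=False to print to the terminal.
    msg = Printer(no_print=silent)
    msg.info("Setting up")
    msg.good("Finished setup")


initialize_something(silent=False)  # prints messages
initialize_something()              # stays quiet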

spacy/cli/_util.py

@@ -448,19 +448,8 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
     return result
 
 
-class CliLogger:
-    """Helper mocking up the most commonly used logger methods. Can be passed
-    into functions like train() to make them output pretty-printed messages
-    on the CLI and regular logging if used from within Python.
-    """
-
-    debug = msg.text
-    info = msg.info
-    warn = msg.info
-    error = msg.fail
-
-
-def setup_gpu(use_gpu: int):
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info."""
     if use_gpu >= 0:
         msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
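
The surviving helper only logs and enables the GPU when a non-negative device ID is passed. A standalone sketch of how a CLI command might call it; the require_gpu import from thinc.api is an assumption, since the import block of _util.py is not part of this hunk:

from thinc.api import require_gpu  # assumed source of require_gpu; not shown in this hunk
from wasabi import msg


def setup_gpu(use_gpu: int) -> None:
    """Configure the GPU and log info (mirrors the helper kept above)."""
    if use_gpu >= 0:
        msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)


setup_gpu(0)   # e.g. from a --gpu-id option; activates GPU 0
setup_gpu(-1)  # -1 means "no GPU", so nothing is configured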

spacy/cli/init_pipeline.py

@@ -7,7 +7,7 @@ import typer
 from .. import util
 from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, CliLogger, setup_gpu
+from ._util import import_code, setup_gpu
 
 
 @init_cli.command(
@@ -32,6 +32,6 @@ def init_pipeline_cli(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")

spacy/cli/pretrain.py

@@ -5,7 +5,7 @@ import typer
 import re
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, CliLogger
+from ._util import import_code, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
@@ -73,7 +73,7 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
-        logger=CliLogger,
+        silent=False,
     )
     msg.good("Successfully finished pretrain")

spacy/cli/train.py

@@ -6,7 +6,7 @@ import typer
 import logging
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, CliLogger, setup_gpu
+from ._util import import_code, setup_gpu
 from ..language import Language
 from ..training.loop import train
 from ..training.initialize import init_nlp, must_reinitialize
@@ -50,15 +50,13 @@ def train_cli(
     msg.divider("Initializing pipeline")
     nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
-    if final_path:
-        msg.good(f"Saved pipeline to output directory", final_path)
+    train(nlp, output_path, use_gpu=use_gpu, silent=False)
 
 
 def init_pipeline(
     config: Config, output_path: Optional[Path], *, use_gpu: int = -1
 ) -> Language:
-    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
+    init_kwargs = {"use_gpu": use_gpu, "silent": False}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():

spacy/training/initialize.py

@@ -1,7 +1,8 @@
-from typing import Union, Dict, Optional, Any, List, Callable
+from typing import Union, Dict, Optional, Any, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
+from wasabi import Printer
 import srsly
 
 from .loop import create_before_to_disk_callback
@@ -10,16 +11,11 @@ from ..lookups import Lookups
 from ..errors import Errors
 from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
-from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
 
-def init_nlp(
-    config: Config,
-    *,
-    use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
-    on_success: Callable[[str], None] = lambda x: None,
-) -> Language:
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+    msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -30,7 +26,7 @@ def init_nlp(
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    on_success("Set up nlp object from config")
+    msg.good("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -38,29 +34,31 @@ def init_nlp(
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
     V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    init_vocab(
+        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
+    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    msg.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    on_success(f"Initialized pipeline components")
+    msg.good(f"Initialized pipeline components")
     # Verify the config after calling 'begin_training' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
         P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, I)
+        loaded = add_tok2vec_weights(nlp, P, V)
         if loaded and P["component"]:
-            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
@@ -76,11 +74,12 @@ def init_vocab(
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    on_success: Callable[[str], None] = lambda x: None,
+    silent: bool = True,
 ) -> Language:
+    msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
-        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -96,11 +95,11 @@ def init_vocab(
         else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    on_success("Created vocabulary")
+        msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    msg.good("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        on_success(f"Added vectors: {vectors}")
+        msg.good(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
@@ -137,8 +136,8 @@ def add_tok2vec_weights(
     init_tok2vec = ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
         if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
         if not init_tok2vec.exists():
             err = f"can't find pretrained tok2vec: {init_tok2vec}"

spacy/training/loop.py

@@ -5,12 +5,13 @@ from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
 import tqdm
+from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..language import Language
 from ..errors import Errors
-from ..util import resolve_dot_names, registry, logger
+from ..util import resolve_dot_names, registry
 
 
 def train(
@@ -18,8 +19,8 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
-) -> Optional[Path]:
+    silent: bool = False,
+) -> None:
     """Train a pipeline.
 
     nlp (Language): The initialized nlp object with the full config.
@@ -31,7 +32,7 @@ def train(
         swapped for CLI logger.
     RETURNS (Path / None): The path to the final exported model.
     """
+    msg = Printer(no_print=silent)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -62,10 +63,10 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    msg.info(f"Pipeline: {nlp.pipe_names}")
     if frozen_components:
-        logger.info(f"Frozen components: {frozen_components}")
-    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
+        msg.info(f"Frozen components: {frozen_components}")
+    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
     with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)
         try:
@@ -89,7 +90,7 @@ def train(
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
             # specific error.
-            logger.warn(
+            msg.warn(
                 f"Aborting and saving the final best model. "
                 f"Encountered exception: {str(e)}"
             )
@@ -105,7 +106,7 @@ def train(
                 nlp.to_disk(final_model_path)
         else:
             nlp.to_disk(final_model_path)
-    return final_model_path
+        msg.good(f"Saved pipeline to output directory", final_model_path)
 
 
 def train_while_improving(

spacy/training/pretrain.py

@@ -1,4 +1,4 @@
-from typing import Optional, Callable, Any, Iterable, Union, List
+from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
 from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
 from pathlib import Path
@@ -8,7 +8,7 @@ import srsly
 import numpy
 import time
 import re
-from wasabi import msg
+from wasabi import Printer
 
 from .example import Example
 from ..tokens import Doc
@@ -16,7 +16,7 @@ from ..attrs import ID
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, dot_to_object, logger
+from ..util import registry, load_model_from_config, dot_to_object
 
 
 def pretrain(
@@ -25,8 +25,9 @@ def pretrain(
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
+    silent: bool = True,
 ):
+    msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
     allocator = config["training"]["gpu_allocator"]
@@ -42,11 +43,10 @@ def pretrain(
     optimizer = P["optimizer"]
     # Load in pretrained weights to resume from
     if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
+        _resume_model(model, resume_path, epoch_resume, silent=silent)
     else:
         # Without '--resume-path' the '--epoch-resume' argument is ignored
         epoch_resume = 0
 
-    # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@@ -94,12 +94,10 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
 
 
 def _resume_model(
-    model: Model,
-    resume_path: Path,
-    epoch_resume: int,
-    logger: Callable[[Any], Any] = logger,
+    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
 ) -> None:
-    logger.info(f"Resume training tok2vec from: {resume_path}")
+    msg = Printer(no_print=silent)
+    msg.info(f"Resume training tok2vec from: {resume_path}")
     with resume_path.open("rb") as file_:
         weights_data = file_.read()
         model.get_ref("tok2vec").from_bytes(weights_data)
@@ -108,9 +106,9 @@ def _resume_model(
     if model_name:
         # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
         epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-        logger.info(f"Resuming from epoch: {epoch_resume}")
+        msg.info(f"Resuming from epoch: {epoch_resume}")
     else:
-        logger.info(f"Resuming from epoch: {epoch_resume}")
+        msg.info(f"Resuming from epoch: {epoch_resume}")
 
 
 def make_update(