diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c41905970..2c944bf3a 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -448,19 +448,8 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
     return result
 
 
-class CliLogger:
-    """Helper mocking up the most commonly used logger methods. Can be passed
-    into functions like train() to make them output pretty-printed messages
-    on the CLI and regular logging if used from within Python.
-    """
-
-    debug = msg.text
-    info = msg.info
-    warn = msg.info
-    error = msg.fail
-
-
-def setup_gpu(use_gpu: int):
+def setup_gpu(use_gpu: int) -> None:
+    """Configure the GPU and log info."""
     if use_gpu >= 0:
         msg.info(f"Using GPU: {use_gpu}")
         require_gpu(use_gpu)
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index de1dc8a46..a92705cb0 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -7,7 +7,7 @@ import typer
 
 from .. import util
 from ..training.initialize import init_nlp
 from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, CliLogger, setup_gpu
+from ._util import import_code, setup_gpu
 
 
 @init_cli.command(
@@ -32,6 +32,6 @@ def init_pipeline_cli(
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides)
     with show_validation_error(hint_fill=False):
-        nlp = init_nlp(config, use_gpu=use_gpu, logger=CliLogger, on_succcess=msg.good)
+        nlp = init_nlp(config, use_gpu=use_gpu, silent=False)
     nlp.to_disk(output_path)
     msg.good(f"Saved initialized pipeline to {output_path}")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 6494486a9..de9341449 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -5,7 +5,7 @@ import typer
 import re
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, CliLogger
+from ._util import import_code, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
 
@@ -73,7 +73,7 @@ def pretrain_cli(
         resume_path=resume_path,
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
-        logger=CliLogger,
+        silent=False,
     )
     msg.good("Successfully finished pretrain")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index aa0e71b5a..b0bd48ddb 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -6,7 +6,7 @@ import typer
 import logging
 
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, CliLogger, setup_gpu
+from ._util import import_code, setup_gpu
 from ..language import Language
 from ..training.loop import train
 from ..training.initialize import init_nlp, must_reinitialize
@@ -50,15 +50,13 @@ def train_cli(
     msg.divider("Initializing pipeline")
     nlp = init_pipeline(config, output_path, use_gpu=use_gpu)
     msg.divider("Training pipeline")
-    final_path = train(nlp, output_path, use_gpu=use_gpu, logger=CliLogger)
-    if final_path:
-        msg.good(f"Saved pipeline to output directory", final_path)
+    train(nlp, output_path, use_gpu=use_gpu, silent=False)
 
 
 def init_pipeline(
     config: Config, output_path: Optional[Path], *, use_gpu: int = -1
 ) -> Language:
-    init_kwargs = {"use_gpu": use_gpu, "logger": CliLogger, "on_success": msg.good}
+    init_kwargs = {"use_gpu": use_gpu, "silent": False}
     if output_path is not None:
         init_path = output_path / "model-initial"
         if not init_path.exists():
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 8938886fe..ecfc57ee9 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -1,7 +1,8 @@
-from typing import Union, Dict, Optional, Any, List, Callable
+from typing import Union, Dict, Optional, Any, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator
 from thinc.api import ConfigValidationError
 from pathlib import Path
+from wasabi import Printer
 import srsly
 
 from .loop import create_before_to_disk_callback
@@ -10,16 +11,11 @@ from ..lookups import Lookups
 from ..errors import Errors
 from ..schemas import ConfigSchemaTraining, ConfigSchemaInit, ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, resolve_dot_names
-from ..util import load_model, ensure_path, logger, OOV_RANK, DEFAULT_OOV_PROB
+from ..util import load_model, ensure_path, OOV_RANK, DEFAULT_OOV_PROB
 
 
-def init_nlp(
-    config: Config,
-    *,
-    use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
-    on_success: Callable[[str], None] = lambda x: None,
-) -> Language:
+def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Language:
+    msg = Printer(no_print=silent)
     raw_config = config
     config = raw_config.interpolate()
     if config["training"]["seed"] is not None:
@@ -30,7 +26,7 @@ def init_nlp(
     # Use original config here before it's resolved to functions
     sourced_components = get_sourced_components(config)
     nlp = load_model_from_config(raw_config, auto_fill=True)
-    on_success("Set up nlp object from config")
+    msg.good("Set up nlp object from config")
     config = nlp.config.interpolate()
     # Resolve all training-relevant sections using the filled nlp config
     T = registry.resolve(config["training"], schema=ConfigSchemaTraining)
@@ -38,29 +34,31 @@ def init_nlp(
     train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
     I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
     V = I["vocab"]
-    init_vocab(nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"])
+    init_vocab(
+        nlp, data=V["data"], lookups=V["lookups"], vectors=V["vectors"], silent=silent
+    )
     optimizer = T["optimizer"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced_components if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    msg.info(f"Pipeline: {nlp.pipe_names}")
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            msg.info(f"Resuming training for: {resume_components}")
             nlp.resume_training(sgd=optimizer)
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
-    on_success(f"Initialized pipeline components")
+    msg.good(f"Initialized pipeline components")
     # Verify the config after calling 'begin_training' to ensure labels
     # are properly initialized
     verify_config(nlp)
     if "pretraining" in config and config["pretraining"]:
         P = registry.resolve(config["pretraining"], schema=ConfigSchemaPretrain)
-        loaded = add_tok2vec_weights(nlp, P, I)
+        loaded = add_tok2vec_weights(nlp, P, V)
         if loaded and P["component"]:
-            on_success(f"Loaded pretrained weights into component '{P['component']}'")
+            msg.good(f"Loaded pretrained weights into component '{P['component']}'")
     nlp = before_to_disk(nlp)
     return nlp
 
@@ -76,11 +74,12 @@ def init_vocab(
     data: Optional[Path] = None,
     lookups: Optional[Lookups] = None,
     vectors: Optional[str] = None,
-    on_success: Callable[[str], None] = lambda x: None,
+    silent: bool = True,
 ) -> Language:
+    msg = Printer(no_print=silent)
     if lookups:
         nlp.vocab.lookups = lookups
-        on_success(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        msg.good(f"Added vocab lookups: {', '.join(lookups.tables)}")
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -96,11 +95,11 @@ def init_vocab(
     else:
         oov_prob = DEFAULT_OOV_PROB
     nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    on_success(f"Added {len(nlp.vocab)} lexical entries to the vocab")
-    on_success("Created vocabulary")
+    msg.good(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    msg.good("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        on_success(f"Added vectors: {vectors}")
+        msg.good(f"Added vectors: {vectors}")
 
 
 def load_vectors_into_model(
@@ -137,8 +136,8 @@ def add_tok2vec_weights(
     init_tok2vec = ensure_path(V["init_tok2vec"])
     if init_tok2vec is not None:
         if P["objective"].get("type") == "vectors" and not V["vectors"]:
-            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize", "vectors"], "msg": err}]
+            err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"'
+            errors = [{"loc": ["initialize", "vocab"], "msg": err}]
             raise ConfigValidationError(config=nlp.config, errors=errors)
         if not init_tok2vec.exists():
             err = f"can't find pretrained tok2vec: {init_tok2vec}"
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 3e3e9f5ce..5153be66c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -5,12 +5,13 @@ from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
 import tqdm
+from wasabi import Printer
 
 from .example import Example
 from ..schemas import ConfigSchemaTraining
 from ..language import Language
 from ..errors import Errors
-from ..util import resolve_dot_names, registry, logger
+from ..util import resolve_dot_names, registry
 
 
 def train(
@@ -18,8 +19,8 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
-) -> Optional[Path]:
+    silent: bool = False,
+) -> None:
     """Train a pipeline.
 
     nlp (Language): The initialized nlp object with the full config.
@@ -31,7 +32,7 @@ def train(
         swapped for CLI logger.
     RETURNS (Path / None): The path to the final exported model.
     """
-
+    msg = Printer(no_print=silent)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -62,10 +63,10 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    msg.info(f"Pipeline: {nlp.pipe_names}")
     if frozen_components:
-        logger.info(f"Frozen components: {frozen_components}")
-    logger.info(f"Initial learn rate: {optimizer.learn_rate}")
+        msg.info(f"Frozen components: {frozen_components}")
+    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
     with nlp.select_pipes(disable=frozen_components):
         print_row, finalize_logger = train_logger(nlp)
         try:
@@ -89,7 +90,7 @@ def train(
             if output_path is not None:
                 # We don't want to swallow the traceback if we don't have a
                 # specific error.
-                logger.warn(
+                msg.warn(
                     f"Aborting and saving the final best model. "
                     f"Encountered exception: {str(e)}"
                 )
@@ -105,7 +106,7 @@ def train(
                 nlp.to_disk(final_model_path)
         else:
             nlp.to_disk(final_model_path)
-    return final_model_path
+    msg.good(f"Saved pipeline to output directory", final_model_path)
 
 
 def train_while_improving(
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index e8dd9df30..5e136cdf1 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -1,4 +1,4 @@
-from typing import Optional, Callable, Any, Iterable, Union, List
+from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
 from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
 from pathlib import Path
@@ -8,7 +8,7 @@ import srsly
 import numpy
 import time
 import re
-from wasabi import msg
+from wasabi import Printer
 
 from .example import Example
 from ..tokens import Doc
@@ -16,7 +16,7 @@ from ..attrs import ID
 from ..ml.models.multi_task import build_cloze_multi_task_model
 from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..util import registry, load_model_from_config, dot_to_object, logger
+from ..util import registry, load_model_from_config, dot_to_object
 
 
 def pretrain(
@@ -25,8 +25,9 @@ def pretrain(
     resume_path: Optional[Path] = None,
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
-    logger: Callable[[Any], Any] = logger,
+    silent: bool = True,
 ):
+    msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
         fix_random_seed(config["training"]["seed"])
     allocator = config["training"]["gpu_allocator"]
@@ -42,11 +43,10 @@ def pretrain(
     optimizer = P["optimizer"]
     # Load in pretrained weights to resume from
    if resume_path is not None:
-        _resume_model(model, resume_path, epoch_resume)
+        _resume_model(model, resume_path, epoch_resume, silent=silent)
     else:
         # Without '--resume-path' the '--epoch-resume' argument is ignored
         epoch_resume = 0
 
-    # TODO: move this to logger function?
     tracker = ProgressTracker(frequency=10000)
     msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@@ -94,12 +94,10 @@ def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
 
 
 def _resume_model(
-    model: Model,
-    resume_path: Path,
-    epoch_resume: int,
-    logger: Callable[[Any], Any] = logger,
+    model: Model, resume_path: Path, epoch_resume: int, silent: bool = True,
 ) -> None:
-    logger.info(f"Resume training tok2vec from: {resume_path}")
+    msg = Printer(no_print=silent)
+    msg.info(f"Resume training tok2vec from: {resume_path}")
     with resume_path.open("rb") as file_:
         weights_data = file_.read()
         model.get_ref("tok2vec").from_bytes(weights_data)
@@ -108,9 +106,9 @@ def _resume_model(
         if model_name:
             # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
             epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
-            logger.info(f"Resuming from epoch: {epoch_resume}")
+            msg.info(f"Resuming from epoch: {epoch_resume}")
         else:
-            logger.info(f"Resuming from epoch: {epoch_resume}")
+            msg.info(f"Resuming from epoch: {epoch_resume}")
 
 
 def make_update(