diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 57a88159d..0b27f63dc 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -3,6 +3,7 @@ from pathlib import Path
 from wasabi import msg
 import typer
 import logging
+import sys

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
@@ -39,7 +40,12 @@ def train_cli(
     DOCS: https://nightly.spacy.io/api/cli#train
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
-    verify_cli_args(config_path, output_path)
+    # Make sure all files and paths exist if they are needed
+    if not config_path or not config_path.exists():
+        msg.fail("Config file not found", config_path, exits=1)
+    if output_path is not None and not output_path.exists():
+        output_path.mkdir()
+        msg.good(f"Created output directory: {output_path}")
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -50,14 +56,4 @@ def train_cli(
         nlp = init_nlp(config, use_gpu=use_gpu)
     msg.good("Initialized pipeline")
     msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu, silent=False)
-
-
-def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
-    # Make sure all files and paths exists if they are needed
-    if not config_path or not config_path.exists():
-        msg.fail("Config file not found", config_path, exits=1)
-    if output_path is not None:
-        if not output_path.exists():
-            output_path.mkdir()
-            msg.good(f"Created output directory: {output_path}")
+    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index d64f211c4..7cb1555d7 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -102,7 +102,7 @@ def load_vectors_into_model(
             "with the packaged vectors. Make sure that the vectors package you're "
             "loading is compatible with the current version of spaCy."
         )
-        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
         raise err from None
     nlp.vocab.vectors = vectors_nlp.vocab.vectors
     if add_strings:
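Note on the `initialize.py` hunk: `ConfigValidationError.from_error` takes the caught error as its first positional argument, which the old call omitted, so the original validation details were lost when re-raising. A minimal self-contained sketch of the corrected pattern (the vectors package name and description text here are illustrative):

```python
from thinc.config import ConfigValidationError
from spacy.util import load_model

name = "en_vectors_web_lg"  # illustrative vectors package
try:
    vectors_nlp = load_model(name)
except ConfigValidationError as e:
    title = f"Config validation error for vectors {name}"
    desc = "The config.cfg shipped with the vectors package failed to validate."
    # Passing `e` through keeps the original config and field errors
    # attached to the friendlier title/description raised instead.
    raise ConfigValidationError.from_error(e, config=None, title=title, desc=desc) from None
```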
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 0f054d433..be2da4bd8 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -1,18 +1,24 @@
-from typing import Dict, Any, Tuple, Callable, List
+from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
+import wasabi
+import tqdm
+import sys

 from ..util import registry
 from .. import util
 from ..errors import Errors
-from wasabi import msg


 @registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger():
+def console_logger(progress_bar: bool = False):
     def setup_printer(
         nlp: "Language",
+        stdout: IO = sys.stdout,
+        stderr: IO = sys.stderr,
     ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
+        msg = wasabi.Printer(no_print=True)
         # we assume here that only components are enabled that should be trained & logged
         logged_pipes = nlp.pipe_names
+        eval_frequency = nlp.config["training"]["eval_frequency"]
         score_weights = nlp.config["training"]["score_weights"]
         score_cols = [col for col, value in score_weights.items() if value is not None]
         score_widths = [max(len(col), 6) for col in score_cols]
@@ -22,10 +28,18 @@ def console_logger():
         table_header = [col.upper() for col in table_header]
         table_widths = [3, 6] + loss_widths + score_widths + [6]
         table_aligns = ["r" for _ in table_widths]
-        msg.row(table_header, widths=table_widths)
-        msg.row(["-" * width for width in table_widths])
+        stdout.write(msg.row(table_header, widths=table_widths))
+        stdout.write(msg.row(["-" * width for width in table_widths]))
+        progress = None

-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
+            nonlocal progress
+
+            if info is None:
+                # No new checkpoint: just advance the progress bar and return.
+                if progress is not None:
+                    progress.update(1)
+                return
             try:
                 losses = [
                     "{0:.2f}".format(float(info["losses"][pipe_name]))
@@ -39,24 +53,37 @@ def console_logger():
                         keys=list(info["losses"].keys()),
                     )
                 ) from None
+
             scores = []
             for col in score_cols:
                 score = info["other_scores"].get(col, 0.0)
                 try:
                     score = float(score)
-                    if col != "speed":
-                        score *= 100
-                    scores.append("{0:.2f}".format(score))
                 except TypeError:
                     err = Errors.E916.format(name=col, score_type=type(score))
                     raise ValueError(err) from None
+                if col != "speed":
+                    score *= 100
+                scores.append("{0:.2f}".format(score))
+
             data = (
                 [info["epoch"], info["step"]]
                 + losses
                 + scores
                 + ["{0:.2f}".format(float(info["score"]))]
             )
-            msg.row(data, widths=table_widths, aligns=table_aligns)
+            if progress is not None:
+                progress.close()
+            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
+            if progress_bar:
+                # Set disable=None, so that tqdm disables the bar on non-TTY streams
+                progress = tqdm.tqdm(
+                    total=eval_frequency,
+                    disable=None,
+                    leave=False,
+                    file=stderr,
+                )
+                progress.set_description(f"Epoch {info['epoch']+1}")

         def finalize():
             pass
@@ -70,10 +97,12 @@ def console_logger():
 def wandb_logger(project_name: str, remove_config_values: List[str] = []):
     import wandb

-    console = console_logger()
+    console = console_logger(progress_bar=False)

     def setup_logger(
         nlp: "Language",
+        stdout: IO = sys.stdout,
+        stderr: IO = sys.stderr,
     ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
         config = nlp.config.interpolate()
         config_dot = util.dict_to_dot(config)
@@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
             del config_dot[field]
         config = util.dot_to_dict(config_dot)
         wandb.init(project=project_name, config=config, reinit=True)
-        console_log_step, console_finalize = console(nlp)
+        console_log_step, console_finalize = console(nlp, stdout, stderr)

-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
             console_log_step(info)
-            score = info["score"]
-            other_scores = info["other_scores"]
-            losses = info["losses"]
-            wandb.log({"score": score})
-            if losses:
-                wandb.log({f"loss_{k}": v for k, v in losses.items()})
-            if isinstance(other_scores, dict):
-                wandb.log(other_scores)
+            if info is not None:
+                score = info["score"]
+                other_scores = info["other_scores"]
+                losses = info["losses"]
+                wandb.log({"score": score})
+                if losses:
+                    wandb.log({f"loss_{k}": v for k, v in losses.items()})
+                if isinstance(other_scores, dict):
+                    wandb.log(other_scores)

         def finalize():
             console_finalize()
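The logger contract after this change: the registered function returns a setup callable taking `(nlp, stdout, stderr)` and returning `(log_step, finalize)`, where `log_step(None)` means "no new checkpoint, only tick the progress bar". A quick sketch of driving it by hand (the blank pipeline and in-memory streams are illustrative):

```python
import io
import spacy
from spacy import util

nlp = spacy.blank("en")
# Resolve the registered factory and enable the new progress bar option
console_logger = util.registry.loggers.get("spacy.ConsoleLogger.v1")
log_step, finalize = console_logger(progress_bar=True)(
    nlp, stdout=io.StringIO(), stderr=io.StringIO()
)
log_step(None)  # between checkpoints: only advances the progress bar
finalize()
```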
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e20cddd3e..093a9ebb3 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -1,11 +1,11 @@
-from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
 from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
-import tqdm
-from wasabi import Printer
+import wasabi
+import sys

 from .example import Example
 from ..schemas import ConfigSchemaTraining
@@ -21,7 +21,8 @@ def train(
     output_path: Optional[Path] = None,
     *,
     use_gpu: int = -1,
-    silent: bool = False,
+    stdout: IO = sys.stdout,
+    stderr: IO = sys.stderr,
 ) -> None:
     """Train a pipeline.

@@ -29,10 +30,15 @@ def train(
     output_path (Path): Optional output path to save trained model to.
     use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
         before calling this function.
-    silent (bool): Whether to pretty-print outputs.
+    stdout (file): A file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+    stderr (file): A second file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+
     RETURNS (Path / None): The path to the final exported model.
     """
-    msg = Printer(no_print=silent)
+    # We use no_print here so we can respect the stdout/stderr options.
+    msg = wasabi.Printer(no_print=True)
     # Create iterator, which yields out info after each optimization step.
     config = nlp.config.interpolate()
     if config["training"]["seed"] is not None:
@@ -63,50 +69,44 @@ def train(
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
     )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
     if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
+        stdout.write(msg.info(f"Frozen components: {frozen_components}"))
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
     with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
+        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
         for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
+            log_step(info if is_best_checkpoint is not None else None)
+            if is_best_checkpoint and output_path is not None:
+                with nlp.select_pipes(disable=frozen_components):
+                    update_meta(T, nlp, info)
+                with nlp.use_params(optimizer.averages):
+                    nlp = before_to_disk(nlp)
+                    nlp.to_disk(output_path / "model-best")
     except Exception as e:
-        finalize_logger()
         if output_path is not None:
             # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
+            # specific error, but we do want to warn the user that the
+            # model is still being saved before the exception is re-raised.
+            stdout.write(
+                msg.warn(
+                    f"Aborting and saving the final best model. "
+                    f"Encountered exception: {str(e)}"
+                )
             )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
         raise e
     finally:
         finalize_logger()
         if output_path is not None:
-            final_model_path = output_path / "model-final"
+            final_model_path = output_path / "model-last"
             if optimizer.averages:
                 with nlp.use_params(optimizer.averages):
                     nlp.to_disk(final_model_path)
             else:
                 nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory", final_model_path)
+            # This will only run if we don't hit an error
+            stdout.write(msg.good("Saved pipeline to output directory", final_model_path))


 def train_while_improving(
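With `silent` gone, callers now control verbosity through the streams themselves, as the new docstring suggests. A sketch of a quiet invocation (assumes an initialized `nlp` pipeline and an existing output directory; the paths are illustrative):

```python
import io
from pathlib import Path
from spacy.training.loop import train

# Hypothetical quiet run: capture training output instead of printing it
captured = io.StringIO()
train(nlp, Path("output"), use_gpu=-1, stdout=captured, stderr=io.StringIO())
print(captured.getvalue())  # table rows and info messages end up here
```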
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 74d2f6de5..fb1efec1b 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By
 default, these results are written to the console with the
 [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
 for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
-**dictionary** with the following keys:
+[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
+receives a **dictionary** with the following keys:

 | Key | Value |
 | -------------- | ---------------------------------------------------------------------------------------------- |
@@ -715,30 +715,37 @@
 tabular results to a file:

 ```python
 ### functions.py
-from typing import Tuple, Callable, Dict, Any
+import sys
+from typing import IO, Tuple, Callable, Dict, Any, Optional
 import spacy
+from spacy import Language
 from pathlib import Path

 @spacy.registry.loggers("my_custom_logger.v1")
 def custom_logger(log_path):
-    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
-        with Path(log_path).open("w", encoding="utf8") as file_:
-            file_.write("step\\t")
-            file_.write("score\\t")
-            for pipe in nlp.pipe_names:
-                file_.write(f"loss_{pipe}\\t")
-            file_.write("\\n")
+    def setup_logger(
+        nlp: Language,
+        stdout: IO = sys.stdout,
+        stderr: IO = sys.stderr,
+    ) -> Tuple[Callable, Callable]:
+        stdout.write(f"Logging to {log_path}\n")
+        log_file = Path(log_path).open("w", encoding="utf8")
+        log_file.write("step\\t")
+        log_file.write("score\\t")
+        for pipe in nlp.pipe_names:
+            log_file.write(f"loss_{pipe}\\t")
+        log_file.write("\\n")

-        def log_step(info: Dict[str, Any]):
-            with Path(log_path).open("a") as file_:
-                file_.write(f"{info['step']}\\t")
-                file_.write(f"{info['score']}\\t")
+        def log_step(info: Optional[Dict[str, Any]]):
+            if info:
+                log_file.write(f"{info['step']}\\t")
+                log_file.write(f"{info['score']}\\t")
                 for pipe in nlp.pipe_names:
-                    file_.write(f"{info['losses'][pipe]}\\t")
-                file_.write("\\n")
+                    log_file.write(f"{info['losses'][pipe]}\\t")
+                log_file.write("\\n")

         def finalize():
-            pass
+            log_file.close()

         return log_step, finalize
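A quick way to sanity-check a logger like this outside of training is to call it by hand: the setup callable takes `(nlp, stdout, stderr)` and returns `(log_step, finalize)`. The module import and `info` values below are illustrative; the keys match the table above:

```python
import sys
import spacy
from functions import custom_logger  # hypothetical module holding the snippet above

nlp = spacy.blank("en")
log_step, finalize = custom_logger("training.log")(nlp, sys.stdout, sys.stderr)
log_step({"epoch": 1, "step": 100, "score": 0.75, "other_scores": {}, "losses": {}})
log_step(None)  # no new checkpoint this step: nothing is written
finalize()
```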