Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-08-13 08:34:57 +03:00 · 2020-10-03 16:08:27 +02:00 · 2020-10-03 16:08:27 +02:00 · 3b8f352eda
commit 3b8f352eda
parent 35d695a031 7b127f307e
6 changed files with 119 additions and 86 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a29"
+__version__ = "3.0.0a30"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -3,6 +3,7 @@ from pathlib import Path
 from wasabi import msg
 import typer
 import logging
+import sys

 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
@ -39,7 +40,12 @@ def train_cli(
    DOCS: https://nightly.spacy.io/api/cli#train
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
-    verify_cli_args(config_path, output_path)
+    # Make sure all files and paths exists if they are needed
+    if not config_path or not config_path.exists():
+        msg.fail("Config file not found", config_path, exits=1)
+    if output_path is not None and not output_path.exists():
+        output_path.mkdir()
+        msg.good(f"Created output directory: {output_path}")
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
@ -50,14 +56,4 @@ def train_cli(
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu, silent=False)
-
-
-def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
-    # Make sure all files and paths exists if they are needed
-    if not config_path or not config_path.exists():
-        msg.fail("Config file not found", config_path, exits=1)
-    if output_path is not None:
-        if not output_path.exists():
-            output_path.mkdir()
-            msg.good(f"Created output directory: {output_path}")
+    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@ -102,7 +102,7 @@ def load_vectors_into_model(
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
-        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
        raise err from None
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@ -1,18 +1,24 @@
-from typing import Dict, Any, Tuple, Callable, List
+from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
+import wasabi
+import tqdm
+import sys

 from ..util import registry
 from .. import util
 from ..errors import Errors
-from wasabi import msg


@registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger():
+def console_logger(progress_bar: bool=False):
    def setup_printer(
        nlp: "Language",
-    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+        stdout: IO=sys.stdout,
+        stderr: IO=sys.stderr
+    ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
+        msg = wasabi.Printer(no_print=True)
        # we assume here that only components are enabled that should be trained & logged
        logged_pipes = nlp.pipe_names
+        eval_frequency = nlp.config["training"]["eval_frequency"]
        score_weights = nlp.config["training"]["score_weights"]
        score_cols = [col for col, value in score_weights.items() if value is not None]
        score_widths = [max(len(col), 6) for col in score_cols]
@ -22,10 +28,18 @@ def console_logger():
        table_header = [col.upper() for col in table_header]
        table_widths = [3, 6] + loss_widths + score_widths + [6]
        table_aligns = ["r" for _ in table_widths]
-        msg.row(table_header, widths=table_widths)
-        msg.row(["-" * width for width in table_widths])
+        stdout.write(msg.row(table_header, widths=table_widths))
+        stdout.write(msg.row(["-" * width for width in table_widths]))
+        progress = None

-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
+            nonlocal progress
+
+            if info is None:
+                # If we don't have a new checkpoint, just return.
+                if progress is not None:
+                    progress.update(1)
+                return 
            try:
                losses = [
                    "{0:.2f}".format(float(info["losses"][pipe_name]))
@ -39,24 +53,37 @@ def console_logger():
                        keys=list(info["losses"].keys()),
                    )
                ) from None
+
            scores = []
            for col in score_cols:
                score = info["other_scores"].get(col, 0.0)
                try:
                    score = float(score)
-                    if col != "speed":
-                        score *= 100
-                    scores.append("{0:.2f}".format(score))
                except TypeError:
                    err = Errors.E916.format(name=col, score_type=type(score))
                    raise ValueError(err) from None
+                if col != "speed":
+                    score *= 100
+                scores.append("{0:.2f}".format(score))
+
            data = (
                [info["epoch"], info["step"]]
                + losses
                + scores
                + ["{0:.2f}".format(float(info["score"]))]
            )
-            msg.row(data, widths=table_widths, aligns=table_aligns)
+            if progress is not None:
+                progress.close()
+            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
+            if progress_bar:
+                # Set disable=None, so that it disables on non-TTY
+                progress = tqdm.tqdm(
+                    total=eval_frequency,
+                    disable=None,
+                    leave=False,
+                    file=stderr
+                )
+                progress.set_description(f"Epoch {info['epoch']+1}")

        def finalize():
            pass
@ -70,10 +97,12 @@ def console_logger():
 def wandb_logger(project_name: str, remove_config_values: List[str] = []):
    import wandb

-    console = console_logger()
+    console = console_logger(progress_bar=False)

    def setup_logger(
        nlp: "Language",
+        stdout: IO=sys.stdout,
+        stderr: IO=sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
        config = nlp.config.interpolate()
        config_dot = util.dict_to_dot(config)
@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
            del config_dot[field]
        config = util.dot_to_dict(config_dot)
        wandb.init(project=project_name, config=config, reinit=True)
-        console_log_step, console_finalize = console(nlp)
+        console_log_step, console_finalize = console(nlp, stdout, stderr)

-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
-            score = info["score"]
-            other_scores = info["other_scores"]
-            losses = info["losses"]
-            wandb.log({"score": score})
-            if losses:
-                wandb.log({f"loss_{k}": v for k, v in losses.items()})
-            if isinstance(other_scores, dict):
-                wandb.log(other_scores)
+            if info is not None:
+                score = info["score"]
+                other_scores = info["other_scores"]
+                losses = info["losses"]
+                wandb.log({"score": score})
+                if losses:
+                    wandb.log({f"loss_{k}": v for k, v in losses.items()})
+                if isinstance(other_scores, dict):
+                    wandb.log(other_scores)

        def finalize():
            console_finalize()
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@ -1,11 +1,11 @@
-from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
 from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
-import tqdm
-from wasabi import Printer
+import wasabi
+import sys

 from .example import Example
 from ..schemas import ConfigSchemaTraining
@ -21,7 +21,8 @@ def train(
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
-    silent: bool = False,
+    stdout: IO=sys.stdout,
+    stderr: IO=sys.stderr
 ) -> None:
    """Train a pipeline.

@ -29,10 +30,15 @@ def train(
    output_path (Path): Optional output path to save trained model to.
    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
        before calling this function.
-    silent (bool): Whether to pretty-print outputs.
+    stdout (file): A file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+    stderr (file): A second file-like object to write output messages. To disable
+        printing, set to io.StringIO.
+ 
    RETURNS (Path / None): The path to the final exported model.
    """
-    msg = Printer(no_print=silent)
+    # We use no_print here so we can respect the stdout/stderr options.
+    msg = wasabi.Printer(no_print=True)
    # Create iterator, which yields out info after each optimization step.
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
@ -63,50 +69,44 @@ def train(
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
+        stdout.write(msg.info(f"Frozen components: {frozen_components}"))
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
+        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
    try:
-        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-        progress.set_description(f"Epoch 1")
        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
-            if is_best_checkpoint is not None:
-                progress.close()
-                print_row(info)
-                if is_best_checkpoint and output_path is not None:
-                    with nlp.select_pipes(disable=frozen_components):
-                        update_meta(T, nlp, info)
-                    with nlp.use_params(optimizer.averages):
-                        nlp = before_to_disk(nlp)
-                        nlp.to_disk(output_path / "model-best")
-                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
-                progress.set_description(f"Epoch {info['epoch']}")
+            log_step(info if is_best_checkpoint else None)
+            if is_best_checkpoint is not None and output_path is not None:
+                with nlp.select_pipes(disable=frozen_components):
+                    update_meta(T, nlp, info)
+                with nlp.use_params(optimizer.averages):
+                    nlp = before_to_disk(nlp)
+                    nlp.to_disk(output_path / "model-best")
    except Exception as e:
-        finalize_logger()
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a
-            # specific error.
-            msg.warn(
-                f"Aborting and saving the final best model. "
-                f"Encountered exception: {str(e)}"
+            # specific error, but we do want to warn that we're trying
+            # to do something here.
+            stdout.write(
+                msg.warn(
+                    f"Aborting and saving the final best model. "
+                    f"Encountered exception: {str(e)}"
+                )
            )
-            nlp = before_to_disk(nlp)
-            nlp.to_disk(output_path / "model-final")
        raise e
    finally:
        finalize_logger()
        if output_path is not None:
-            final_model_path = output_path / "model-final"
+            final_model_path = output_path / "model-last"
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(final_model_path)
            else:
                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory", final_model_path)
+    # This will only run if we don't hit an error
+    stdout.write(msg.good("Saved pipeline to output directory", final_model_path))


 def train_while_improving(
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By
 default, these results are written to the console with the
 [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
 for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
-**dictionary** with the following keys:
+[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
+receives a **dictionary** with the following keys:

 | Key            | Value                                                                                                 |
 | -------------- | ----------------------------------------------------------------------------------------------------- |
@ -715,30 +715,37 @@ tabular results to a file:

 ```python
 ### functions.py
-from typing import Tuple, Callable, Dict, Any
+import sys
+from typing import IO, Tuple, Callable, Dict, Any
 import spacy
+from spacy import Language
 from pathlib import Path

@spacy.registry.loggers("my_custom_logger.v1")
 def custom_logger(log_path):
-    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
-        with Path(log_path).open("w", encoding="utf8") as file_:
-            file_.write("step\\t")
-            file_.write("score\\t")
-            for pipe in nlp.pipe_names:
-                file_.write(f"loss_{pipe}\\t")
-            file_.write("\\n")
+    def setup_logger(
+        nlp: Language,
+        stdout: IO=sys.stdout,
+        stderr: IO=sys.stderr
+    ) -> Tuple[Callable, Callable]:
+        stdout.write(f"Logging to {log_path}\n")
+        log_file = Path(log_path).open("w", encoding="utf8")
+        log_file.write("step\\t")
+        log_file.write("score\\t")
+        for pipe in nlp.pipe_names:
+            log_file.write(f"loss_{pipe}\\t")
+        log_file.write("\\n")

-        def log_step(info: Dict[str, Any]):
-            with Path(log_path).open("a") as file_:
-                file_.write(f"{info['step']}\\t")
-                file_.write(f"{info['score']}\\t")
+        def log_step(info: Optional[Dict[str, Any]]):
+            if info:
+                log_file.write(f"{info['step']}\\t")
+                log_file.write(f"{info['score']}\\t")
                for pipe in nlp.pipe_names:
-                    file_.write(f"{info['losses'][pipe]}\\t")
-                file_.write("\\n")
+                    log_file.write(f"{info['losses'][pipe]}\\t")
+                log_file.write("\\n")

        def finalize():
-            pass
+            log_file.close()

        return log_step, finalize