Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-07-21 21:49:49 +03:00 · 2020-10-03 16:08:27 +02:00 · 2020-10-03 16:08:27 +02:00 · 3b8f352eda
commit 3b8f352eda
parent 35d695a031 7b127f307e
6 changed files with 119 additions and 86 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a29"
+__version__ = "3.0.0a30"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -3,6 +3,7 @@ from pathlib import Path
 from wasabi import msg
 import typer
 import logging
 import sys
 from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, setup_gpu
@ -39,7 +40,12 @@ def train_cli(
    DOCS: https://nightly.spacy.io/api/cli#train
    """
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
-    verify_cli_args(config_path, output_path)
+    # Make sure all files and paths exist if they are needed
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if output_path is not None and not output_path.exists():
        output_path.mkdir()
        msg.good(f"Created output directory: {output_path}")
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
@ -50,14 +56,4 @@ def train_cli(
        nlp = init_nlp(config, use_gpu=use_gpu)
    msg.good("Initialized pipeline")
    msg.divider("Training pipeline")
-    train(nlp, output_path, use_gpu=use_gpu, silent=False)
+    train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
 def verify_cli_args(config_path: Path, output_path: Optional[Path] = None) -> None:
    # Make sure all files and paths exists if they are needed
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if output_path is not None:
        if not output_path.exists():
            output_path.mkdir()
            msg.good(f"Created output directory: {output_path}")
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@ -102,7 +102,7 @@ def load_vectors_into_model(
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
-        err = ConfigValidationError.from_error(config=None, title=title, desc=desc)
+        err = ConfigValidationError.from_error(e, config=None, title=title, desc=desc)
        raise err from None
    nlp.vocab.vectors = vectors_nlp.vocab.vectors
    if add_strings:
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@ -1,18 +1,24 @@
-from typing import Dict, Any, Tuple, Callable, List
+from typing import TYPE_CHECKING, Dict, Any, Tuple, Callable, List, Optional, IO
 import wasabi
 import tqdm
 import sys
 from ..util import registry
 from .. import util
 from ..errors import Errors
 from wasabi import msg
@registry.loggers("spacy.ConsoleLogger.v1")
-def console_logger():
+def console_logger(progress_bar: bool=False):
    def setup_printer(
        nlp: "Language",
-    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+        stdout: IO=sys.stdout,
        stderr: IO=sys.stderr
    ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable]:
        msg = wasabi.Printer(no_print=True)
        # we assume here that only components are enabled that should be trained & logged
        logged_pipes = nlp.pipe_names
        eval_frequency = nlp.config["training"]["eval_frequency"]
        score_weights = nlp.config["training"]["score_weights"]
        score_cols = [col for col, value in score_weights.items() if value is not None]
        score_widths = [max(len(col), 6) for col in score_cols]
@ -22,10 +28,18 @@ def console_logger():
        table_header = [col.upper() for col in table_header]
        table_widths = [3, 6] + loss_widths + score_widths + [6]
        table_aligns = ["r" for _ in table_widths]
-        msg.row(table_header, widths=table_widths)
+        stdout.write(msg.row(table_header, widths=table_widths))
-        msg.row(["-" * width for width in table_widths])
+        stdout.write(msg.row(["-" * width for width in table_widths]))
        progress = None
-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
            nonlocal progress
            if info is None:
                # If we don't have a new checkpoint, just return.
                if progress is not None:
                    progress.update(1)
                return 
            try:
                losses = [
                    "{0:.2f}".format(float(info["losses"][pipe_name]))
@ -39,24 +53,37 @@ def console_logger():
                        keys=list(info["losses"].keys()),
                    )
                ) from None
            scores = []
            for col in score_cols:
                score = info["other_scores"].get(col, 0.0)
                try:
                    score = float(score)
                    if col != "speed":
                        score *= 100
                    scores.append("{0:.2f}".format(score))
                except TypeError:
                    err = Errors.E916.format(name=col, score_type=type(score))
                    raise ValueError(err) from None
                if col != "speed":
                    score *= 100
                scores.append("{0:.2f}".format(score))
            data = (
                [info["epoch"], info["step"]]
                + losses
                + scores
                + ["{0:.2f}".format(float(info["score"]))]
            )
-            msg.row(data, widths=table_widths, aligns=table_aligns)
+            if progress is not None:
                progress.close()
            stdout.write(msg.row(data, widths=table_widths, aligns=table_aligns))
            if progress_bar:
                # Set disable=None, so that it disables on non-TTY
                progress = tqdm.tqdm(
                    total=eval_frequency,
                    disable=None,
                    leave=False,
                    file=stderr
                )
                progress.set_description(f"Epoch {info['epoch']+1}")
        def finalize():
            pass
@ -70,10 +97,12 @@ def console_logger():
 def wandb_logger(project_name: str, remove_config_values: List[str] = []):
    import wandb
-    console = console_logger()
+    console = console_logger(progress_bar=False)
    def setup_logger(
        nlp: "Language",
        stdout: IO=sys.stdout,
        stderr: IO=sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
        config = nlp.config.interpolate()
        config_dot = util.dict_to_dot(config)
@ -81,18 +110,19 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
            del config_dot[field]
        config = util.dot_to_dict(config_dot)
        wandb.init(project=project_name, config=config, reinit=True)
-        console_log_step, console_finalize = console(nlp)
+        console_log_step, console_finalize = console(nlp, stdout, stderr)
-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
-            score = info["score"]
+            if info is not None:
-            other_scores = info["other_scores"]
+                score = info["score"]
-            losses = info["losses"]
+                other_scores = info["other_scores"]
-            wandb.log({"score": score})
+                losses = info["losses"]
-            if losses:
+                wandb.log({"score": score})
-                wandb.log({f"loss_{k}": v for k, v in losses.items()})
+                if losses:
-            if isinstance(other_scores, dict):
+                    wandb.log({f"loss_{k}": v for k, v in losses.items()})
-                wandb.log(other_scores)
+                if isinstance(other_scores, dict):
                    wandb.log(other_scores)
        def finalize():
            console_finalize()
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@ -1,11 +1,11 @@
-from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any
+from typing import List, Callable, Tuple, Dict, Iterable, Iterator, Union, Any, IO
 from typing import Optional, TYPE_CHECKING
 from pathlib import Path
 from timeit import default_timer as timer
 from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
 import random
-import tqdm
+import wasabi
-from wasabi import Printer
+import sys
 from .example import Example
 from ..schemas import ConfigSchemaTraining
@ -21,7 +21,8 @@ def train(
    output_path: Optional[Path] = None,
    *,
    use_gpu: int = -1,
-    silent: bool = False,
+    stdout: IO=sys.stdout,
    stderr: IO=sys.stderr
 ) -> None:
    """Train a pipeline.
@ -29,10 +30,15 @@ def train(
    output_path (Path): Optional output path to save trained model to.
    use_gpu (int): Whether to train on GPU. Make sure to call require_gpu
        before calling this function.
-    silent (bool): Whether to pretty-print outputs.
+    stdout (file): A file-like object to write output messages. To disable
        printing, pass an io.StringIO() instance.
    stderr (file): A second file-like object to write output messages. To
        disable printing, pass an io.StringIO() instance.
    RETURNS (Path / None): The path to the final exported model.
    """
-    msg = Printer(no_print=silent)
+    # We use no_print here so we can respect the stdout/stderr options.
    msg = wasabi.Printer(no_print=True)
    # Create iterator, which yields out info after each optimization step.
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:
@ -63,50 +69,44 @@ def train(
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
-    msg.info(f"Pipeline: {nlp.pipe_names}")
+    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}"))
    if frozen_components:
-        msg.info(f"Frozen components: {frozen_components}")
+        stdout.write(msg.info(f"Frozen components: {frozen_components}"))
-    msg.info(f"Initial learn rate: {optimizer.learn_rate}")
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}"))
    with nlp.select_pipes(disable=frozen_components):
-        print_row, finalize_logger = train_logger(nlp)
+        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
    try:
        progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
        progress.set_description(f"Epoch 1")
        for batch, info, is_best_checkpoint in training_step_iterator:
-            progress.update(1)
+            log_step(info if is_best_checkpoint else None)
-            if is_best_checkpoint is not None:
+            if is_best_checkpoint is not None and output_path is not None:
-                progress.close()
+                with nlp.select_pipes(disable=frozen_components):
-                print_row(info)
+                    update_meta(T, nlp, info)
-                if is_best_checkpoint and output_path is not None:
+                with nlp.use_params(optimizer.averages):
-                    with nlp.select_pipes(disable=frozen_components):
+                    nlp = before_to_disk(nlp)
-                        update_meta(T, nlp, info)
+                    nlp.to_disk(output_path / "model-best")
                    with nlp.use_params(optimizer.averages):
                        nlp = before_to_disk(nlp)
                        nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=T["eval_frequency"], leave=False)
                progress.set_description(f"Epoch {info['epoch']}")
    except Exception as e:
        finalize_logger()
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a
-            # specific error.
+            # specific error, but we do want to warn that we're trying
-            msg.warn(
+            # to do something here.
-                f"Aborting and saving the final best model. "
+            stdout.write(
-                f"Encountered exception: {str(e)}"
+                msg.warn(
                    f"Aborting and saving the final best model. "
                    f"Encountered exception: {str(e)}"
                )
            )
            nlp = before_to_disk(nlp)
            nlp.to_disk(output_path / "model-final")
        raise e
    finally:
        finalize_logger()
        if output_path is not None:
-            final_model_path = output_path / "model-final"
+            final_model_path = output_path / "model-last"
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(final_model_path)
            else:
                nlp.to_disk(final_model_path)
-            msg.good(f"Saved pipeline to output directory", final_model_path)
+    # This will only run if we don't hit an error
    stdout.write(msg.good("Saved pipeline to output directory", final_model_path))
 def train_while_improving(
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -689,8 +689,8 @@ During training, the results of each step are passed to a logger function. By
 default, these results are written to the console with the
 [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
 for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
+[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
-**dictionary** with the following keys:
+receives a **dictionary** with the following keys:
 | Key            | Value                                                                                                 |
 | -------------- | ----------------------------------------------------------------------------------------------------- |
@ -715,30 +715,37 @@ tabular results to a file:
 ```python
 ### functions.py
-from typing import Tuple, Callable, Dict, Any
+import sys
 from typing import IO, Tuple, Callable, Dict, Any, Optional
 import spacy
 from spacy import Language
 from pathlib import Path
@spacy.registry.loggers("my_custom_logger.v1")
 def custom_logger(log_path):
-    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
+    def setup_logger(
-        with Path(log_path).open("w", encoding="utf8") as file_:
+        nlp: Language,
-            file_.write("step\\t")
+        stdout: IO=sys.stdout,
-            file_.write("score\\t")
+        stderr: IO=sys.stderr
-            for pipe in nlp.pipe_names:
+    ) -> Tuple[Callable, Callable]:
-                file_.write(f"loss_{pipe}\\t")
+        stdout.write(f"Logging to {log_path}\n")
-            file_.write("\\n")
+        log_file = Path(log_path).open("w", encoding="utf8")
        log_file.write("step\\t")
        log_file.write("score\\t")
        for pipe in nlp.pipe_names:
            log_file.write(f"loss_{pipe}\\t")
        log_file.write("\\n")
-        def log_step(info: Dict[str, Any]):
+        def log_step(info: Optional[Dict[str, Any]]):
-            with Path(log_path).open("a") as file_:
+            if info:
-                file_.write(f"{info['step']}\\t")
+                log_file.write(f"{info['step']}\\t")
-                file_.write(f"{info['score']}\\t")
+                log_file.write(f"{info['score']}\\t")
                for pipe in nlp.pipe_names:
-                    file_.write(f"{info['losses'][pipe]}\\t")
+                    log_file.write(f"{info['losses'][pipe]}\\t")
-                file_.write("\\n")
+                log_file.write("\\n")
        def finalize():
-            pass
+            log_file.close()
        return log_step, finalize