Move WandB loggers into spacy-loggers (#9223)

* factor out the WandB logger into spacy-loggers
  Signed-off-by: Elia Robyn Speer <gh@arborelia.net>
* depend on spacy-loggers so they are available
  Signed-off-by: Elia Robyn Speer <gh@arborelia.net>
* remove docs of spacy.WandbLogger.v2 (moved to spacy-loggers)
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* Version number suggestions from code review
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* update references to WandbLogger
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* make order of deps more consistent
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-09-14 08:02:40 +03:00 · 2021-09-29 05:12:50 -04:00 · 2021-09-29 05:12:50 -04:00 · 5b0b0ca809
commit 5b0b0ca809
parent fe5f5d6ac6
7 changed files with 21 additions and 235 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,6 @@
 # Our libraries
 spacy-legacy>=3.0.8,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.10,<8.1.0
--- a/setup.cfg
+++ b/setup.cfg
@ -41,6 +41,7 @@ setup_requires =
 install_requires =
    # Our libraries
    spacy-legacy>=3.0.8,<3.1.0
    spacy-loggers>=1.0.0,<2.0.0
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
--- a/spacy/training/init.py
+++ b/spacy/training/init.py
@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F40
 from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
-from .loggers import console_logger, wandb_logger  # noqa: F401
+from .loggers import console_logger  # noqa: F401
 from .callbacks import create_copy_from_base_model  # noqa: F401
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@ -4,7 +4,6 @@ import tqdm
 import sys
 from ..util import registry
 from .. import util
 from ..errors import Errors
 if TYPE_CHECKING:
@ -100,166 +99,3 @@ def console_logger(progress_bar: bool = False):
    return setup_printer
@registry.loggers("spacy.WandbLogger.v2")
def wandb_logger(
    project_name: str,
    remove_config_values: List[str] = [],
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
):
    """Create a logger that reports each training step to the console and
    to Weights & Biases.

    project_name (str): Passed to `wandb.init` as the `project`.
    remove_config_values (List[str]): Dot-notation keys that are deleted from
        the interpolated config before it is handed to `wandb.init`.
    model_log_interval (Optional[int]): If set, the directory in the step
        info's "output_path" is logged as a "checkpoint" artifact whenever the
        step is a non-zero multiple of this value.
    log_dataset_dir (Optional[str]): If set, this directory is logged as a
        "dataset" artifact when the logger is set up.
    RETURNS (Callable): A setup function taking `(nlp, stdout, stderr)` and
        returning the `(log_step, finalize)` pair.
    RAISES (ImportError): With `Errors.E880` if `wandb`, or its `init`, `log`
        or `join`, can't be imported.
    """
    try:
        import wandb

        # Importing the names used further down makes a wandb install that
        # lacks them fail here rather than in the middle of training.
        from wandb import init, log, join  # noqa: F401
    except ImportError:
        raise ImportError(Errors.E880)

    # Console output is always produced; W&B logging is layered on top of it.
    console = console_logger(progress_bar=False)

    def setup_logger(
        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        # Flatten the config to dot notation so entries can be dropped by
        # their dotted key, then rebuild the nested form for upload.
        flat_config = util.dict_to_dot(nlp.config.interpolate())
        for dotted_key in remove_config_values:
            del flat_config[dotted_key]
        uploaded_config = util.dot_to_dict(flat_config)
        run = wandb.init(project=project_name, config=uploaded_config, reinit=True)
        console_log_step, console_finalize = console(nlp, stdout, stderr)

        def upload_dir(path, name, kind, metadata, aliases):
            # Wrap a directory in an artifact and log it to the current run.
            artifact = wandb.Artifact(name, type=kind, metadata=metadata)
            artifact.add_dir(path, name=name)
            wandb.log_artifact(artifact, aliases=aliases)

        if log_dataset_dir:
            upload_dir(log_dataset_dir, "dataset", "dataset", {}, [])

        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
            if info is None:
                return
            score = info["score"]
            other_scores = info["other_scores"]
            losses = info["losses"]
            wandb.log({"score": score})
            if losses:
                wandb.log({f"loss_{k}": v for k, v in losses.items()})
            if isinstance(other_scores, dict):
                wandb.log(other_scores)
            # Checkpoint upload needs both an interval and an output path.
            if not (model_log_interval and info.get("output_path")):
                return
            if info["step"] % model_log_interval != 0 or info["step"] == 0:
                return
            checkpoint_dir = info["output_path"]
            artifact_name = "pipeline_" + run.id
            checkpoint_aliases = [
                f"epoch {info['epoch']} step {info['step']}",
                "latest",
                # Tagged "best" only when this step's score equals the score
                # of the maximum entry in info["checkpoints"].
                "best" if info["score"] == max(info["checkpoints"])[0] else "",
            ]
            upload_dir(
                checkpoint_dir, artifact_name, "checkpoint", info, checkpoint_aliases
            )

        def finalize() -> None:
            console_finalize()
            wandb.join()

        return log_step, finalize

    return setup_logger
@registry.loggers("spacy.WandbLogger.v3")
def wandb_logger(
    project_name: str,
    remove_config_values: List[str] = [],
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
    entity: Optional[str] = None,
    run_name: Optional[str] = None,
):
    """Create a logger that reports each training step to the console and
    to Weights & Biases, with optional entity and run name.

    project_name (str): Passed to `wandb.init` as the `project`.
    remove_config_values (List[str]): Dot-notation keys that are deleted from
        the interpolated config before it is handed to `wandb.init`.
    model_log_interval (Optional[int]): If set, the directory in the step
        info's "output_path" is logged as a "checkpoint" artifact whenever the
        step is a non-zero multiple of this value.
    log_dataset_dir (Optional[str]): If set, this directory is logged as a
        "dataset" artifact when the logger is set up.
    entity (Optional[str]): Passed to `wandb.init` as the `entity`.
    run_name (Optional[str]): If truthy, assigned to `wandb.run.name` right
        after the run is initialized.
    RETURNS (Callable): A setup function taking `(nlp, stdout, stderr)` and
        returning the `(log_step, finalize)` pair.
    RAISES (ImportError): With `Errors.E880` if `wandb`, or its `init`, `log`
        or `join`, can't be imported.
    """
    try:
        import wandb

        # Importing the names used further down makes a wandb install that
        # lacks them fail here rather than in the middle of training.
        from wandb import init, log, join  # noqa: F401
    except ImportError:
        raise ImportError(Errors.E880)

    # Console output is always produced; W&B logging is layered on top of it.
    console = console_logger(progress_bar=False)

    def setup_logger(
        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        # Flatten the config to dot notation so entries can be dropped by
        # their dotted key, then rebuild the nested form for upload.
        flat_config = util.dict_to_dot(nlp.config.interpolate())
        for dotted_key in remove_config_values:
            del flat_config[dotted_key]
        uploaded_config = util.dot_to_dict(flat_config)
        run = wandb.init(
            project=project_name, config=uploaded_config, entity=entity, reinit=True
        )
        if run_name:
            wandb.run.name = run_name
        console_log_step, console_finalize = console(nlp, stdout, stderr)

        def upload_dir(path, name, kind, metadata, aliases):
            # Wrap a directory in an artifact and log it to the current run.
            artifact = wandb.Artifact(name, type=kind, metadata=metadata)
            artifact.add_dir(path, name=name)
            wandb.log_artifact(artifact, aliases=aliases)

        if log_dataset_dir:
            upload_dir(log_dataset_dir, "dataset", "dataset", {}, [])

        def log_step(info: Optional[Dict[str, Any]]):
            console_log_step(info)
            if info is None:
                return
            score = info["score"]
            other_scores = info["other_scores"]
            losses = info["losses"]
            wandb.log({"score": score})
            if losses:
                wandb.log({f"loss_{k}": v for k, v in losses.items()})
            if isinstance(other_scores, dict):
                wandb.log(other_scores)
            # Checkpoint upload needs both an interval and an output path.
            if not (model_log_interval and info.get("output_path")):
                return
            if info["step"] % model_log_interval != 0 or info["step"] == 0:
                return
            checkpoint_dir = info["output_path"]
            artifact_name = "pipeline_" + run.id
            checkpoint_aliases = [
                f"epoch {info['epoch']} step {info['step']}",
                "latest",
                # Tagged "best" only when this step's score equals the score
                # of the maximum entry in info["checkpoints"].
                "best" if info["score"] == max(info["checkpoints"])[0] else "",
            ]
            upload_dir(
                checkpoint_dir, artifact_name, "checkpoint", info, checkpoint_aliases
            )

        def finalize() -> None:
            console_finalize()
            wandb.join()

        return log_step, finalize

    return setup_logger
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -411,10 +411,13 @@ finished. To log each training step, a
 [`spacy train`](/api/cli#train), including information such as the training loss
 and the accuracy scores on the development set.
-There are two built-in logging functions: a logger printing results to the
+The built-in, default logger is the ConsoleLogger, which prints results to the
-console in tabular format (which is the default), and one that also sends the
+console in tabular format. The 
-results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
+[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
-using one of the built-in loggers listed here, you can also
+a dependency of spaCy, enables other loggers: currently it provides one that sends
 results to a [Weights & Biases](https://www.wandb.com/) dashboard.
 Instead of using one of the built-in loggers, you can
 [implement your own](/usage/training#custom-logging).
 #### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
@ -463,63 +466,6 @@ start decreasing across epochs.
 </Accordion>
 #### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"}
 > #### Installation
 >
 > ```bash
 > $ pip install wandb
 > $ wandb login
 > ```
 Built-in logger that sends the results of each training step to the dashboard of
 the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
 & Biases should be installed, and you should be logged in. The logger will send
 the full config file to W&B, as well as various system information such as
 memory utilization, network traffic, disk IO, GPU statistics, etc. This will
 also include information such as your hostname and operating system, as well as
 the location of your Python executable.
 <Infobox variant="warning">
 Note that by default, the full (interpolated)
 [training config](/usage/training#config) is sent over to the W&B dashboard. If
 you prefer to **exclude certain information** such as path names, you can list
 those fields in "dot notation" in the `remove_config_values` parameter. These
 fields will then be removed from the config before uploading, but will otherwise
 remain in the config file stored on your local system.
 </Infobox>
 > #### Example config
 >
 > ```ini
 > [training.logger]
 > @loggers = "spacy.WandbLogger.v3"
 > project_name = "monitor_spacy_training"
 > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 > log_dataset_dir = "corpus"
 > model_log_interval = 1000
 > ```
 | Name                   | Description                                                                                                                                                                                                     |
 | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~                                                                           |
 | `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                                                                                                        |
 | `model_log_interval`   | Steps to wait between logging model checkpoints to W&B dashboard (default: None). ~~Optional[int]~~                                                                                                             |
 | `log_dataset_dir`      | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~                                                                                                      |
 | `run_name`             | The name of the run. If you don't specify a run_name, the name will be created by the wandb library (default: None). ~~Optional[str]~~                                                                          |
 | `entity`               | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ |
 <Project id="integrations/wandb">
 Get started with tracking your spaCy training runs in Weights & Biases using our
 project template. It trains on the IMDB Movie Review Dataset and includes a
 simple config with the built-in `WandbLogger`, as well as a custom example of
 creating variants of the config for a simple hyperparameter grid search and
 logging the results.
 </Project>
 ## Readers {#readers}
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@ -1016,20 +1016,22 @@ commands:
 [Weights & Biases](https://www.wandb.com/) is a popular platform for experiment
 tracking. spaCy integrates with it out-of-the-box via the
-[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the
+[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which
-`[training.logger]` block of your training [config](/usage/training#config). The
+you can add as the `[training.logger]` block of your training
-results of each step are then logged in your project, together with the full
+[config](/usage/training#config). The results of each step are then logged in
-**training config**. This means that _every_ hyperparameter, registered function
+your project, together with the full **training config**. This means that
-name and argument will be tracked and you'll be able to see the impact it has on
+_every_ hyperparameter, registered function name and argument will be tracked
-your results.
+and you'll be able to see the impact it has on your results.
 > #### Example config
 >
 > ```ini
 > [training.logger]
-> @loggers = "spacy.WandbLogger.v2"
+> @loggers = "spacy.WandbLogger.v3"
 > project_name = "monitor_spacy_training"
 > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 > log_dataset_dir = "corpus"
 > model_log_interval = 1000
 > ```
 ![Screenshot: Visualized training results](../images/wandb1.jpg)
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -944,8 +944,8 @@ During training, the results of each step are passed to a logger function. By
 default, these results are written to the console with the
 [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
 for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
+[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each
-receives a **dictionary** with the following keys:
+step, the logger function receives a **dictionary** with the following keys:
 | Key            | Value                                                                                                 |
 | -------------- | ----------------------------------------------------------------------------------------------------- |