Move WandB loggers into spacy-loggers (#9223)

* factor out the WandB logger into spacy-loggers
  Signed-off-by: Elia Robyn Speer <gh@arborelia.net>
* depend on spacy-loggers so they are available
  Signed-off-by: Elia Robyn Speer <gh@arborelia.net>
* remove docs of spacy.WandbLogger.v2 (moved to spacy-loggers)
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* Version number suggestions from code review
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* update references to WandbLogger
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>
* make order of deps more consistent
  Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-09-14 08:02:40 +03:00 · 2021-09-29 05:12:50 -04:00 · 2021-09-29 05:12:50 -04:00 · 5b0b0ca809
commit 5b0b0ca809
parent fe5f5d6ac6
7 changed files with 21 additions and 235 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,6 @@
 # Our libraries
 spacy-legacy>=3.0.8,<3.1.0
+spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.10,<8.1.0
--- a/setup.cfg
+++ b/setup.cfg
@ -41,6 +41,7 @@ setup_requires =
 install_requires =
    # Our libraries
    spacy-legacy>=3.0.8,<3.1.0
+    spacy-loggers>=1.0.0,<2.0.0
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F40
 from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
-from .loggers import console_logger, wandb_logger  # noqa: F401
+from .loggers import console_logger  # noqa: F401
 from .callbacks import create_copy_from_base_model  # noqa: F401
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@ -4,7 +4,6 @@ import tqdm
 import sys

 from ..util import registry
-from .. import util
 from ..errors import Errors

 if TYPE_CHECKING:
@ -100,166 +99,3 @@ def console_logger(progress_bar: bool = False):

    return setup_printer

-
-@registry.loggers("spacy.WandbLogger.v2")
-def wandb_logger(
-    project_name: str,
-    remove_config_values: List[str] = [],
-    model_log_interval: Optional[int] = None,
-    log_dataset_dir: Optional[str] = None,
-):
-    try:
-        import wandb
-
-        # test that these are available
-        from wandb import init, log, join  # noqa: F401
-    except ImportError:
-        raise ImportError(Errors.E880)
-
-    console = console_logger(progress_bar=False)
-
-    def setup_logger(
-        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
-    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
-        config = nlp.config.interpolate()
-        config_dot = util.dict_to_dot(config)
-        for field in remove_config_values:
-            del config_dot[field]
-        config = util.dot_to_dict(config_dot)
-        run = wandb.init(project=project_name, config=config, reinit=True)
-        console_log_step, console_finalize = console(nlp, stdout, stderr)
-
-        def log_dir_artifact(
-            path: str,
-            name: str,
-            type: str,
-            metadata: Optional[Dict[str, Any]] = {},
-            aliases: Optional[List[str]] = [],
-        ):
-            dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata)
-            dataset_artifact.add_dir(path, name=name)
-            wandb.log_artifact(dataset_artifact, aliases=aliases)
-
-        if log_dataset_dir:
-            log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset")
-
-        def log_step(info: Optional[Dict[str, Any]]):
-            console_log_step(info)
-            if info is not None:
-                score = info["score"]
-                other_scores = info["other_scores"]
-                losses = info["losses"]
-                wandb.log({"score": score})
-                if losses:
-                    wandb.log({f"loss_{k}": v for k, v in losses.items()})
-                if isinstance(other_scores, dict):
-                    wandb.log(other_scores)
-                if model_log_interval and info.get("output_path"):
-                    if info["step"] % model_log_interval == 0 and info["step"] != 0:
-                        log_dir_artifact(
-                            path=info["output_path"],
-                            name="pipeline_" + run.id,
-                            type="checkpoint",
-                            metadata=info,
-                            aliases=[
-                                f"epoch {info['epoch']} step {info['step']}",
-                                "latest",
-                                "best"
-                                if info["score"] == max(info["checkpoints"])[0]
-                                else "",
-                            ],
-                        )
-
-        def finalize() -> None:
-            console_finalize()
-            wandb.join()
-
-        return log_step, finalize
-
-    return setup_logger
-
-
-@registry.loggers("spacy.WandbLogger.v3")
-def wandb_logger(
-    project_name: str,
-    remove_config_values: List[str] = [],
-    model_log_interval: Optional[int] = None,
-    log_dataset_dir: Optional[str] = None,
-    entity: Optional[str] = None,
-    run_name: Optional[str] = None,
-):
-    try:
-        import wandb
-
-        # test that these are available
-        from wandb import init, log, join  # noqa: F401
-    except ImportError:
-        raise ImportError(Errors.E880)
-
-    console = console_logger(progress_bar=False)
-
-    def setup_logger(
-        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
-    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
-        config = nlp.config.interpolate()
-        config_dot = util.dict_to_dot(config)
-        for field in remove_config_values:
-            del config_dot[field]
-        config = util.dot_to_dict(config_dot)
-        run = wandb.init(
-            project=project_name, config=config, entity=entity, reinit=True
-        )
-
-        if run_name:
-            wandb.run.name = run_name
-
-        console_log_step, console_finalize = console(nlp, stdout, stderr)
-
-        def log_dir_artifact(
-            path: str,
-            name: str,
-            type: str,
-            metadata: Optional[Dict[str, Any]] = {},
-            aliases: Optional[List[str]] = [],
-        ):
-            dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata)
-            dataset_artifact.add_dir(path, name=name)
-            wandb.log_artifact(dataset_artifact, aliases=aliases)
-
-        if log_dataset_dir:
-            log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset")
-
-        def log_step(info: Optional[Dict[str, Any]]):
-            console_log_step(info)
-            if info is not None:
-                score = info["score"]
-                other_scores = info["other_scores"]
-                losses = info["losses"]
-                wandb.log({"score": score})
-                if losses:
-                    wandb.log({f"loss_{k}": v for k, v in losses.items()})
-                if isinstance(other_scores, dict):
-                    wandb.log(other_scores)
-                if model_log_interval and info.get("output_path"):
-                    if info["step"] % model_log_interval == 0 and info["step"] != 0:
-                        log_dir_artifact(
-                            path=info["output_path"],
-                            name="pipeline_" + run.id,
-                            type="checkpoint",
-                            metadata=info,
-                            aliases=[
-                                f"epoch {info['epoch']} step {info['step']}",
-                                "latest",
-                                "best"
-                                if info["score"] == max(info["checkpoints"])[0]
-                                else "",
-                            ],
-                        )
-
-        def finalize() -> None:
-            console_finalize()
-            wandb.join()
-
-        return log_step, finalize
-
-    return setup_logger
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -411,10 +411,13 @@ finished. To log each training step, a
 [`spacy train`](/api/cli#train), including information such as the training loss
 and the accuracy scores on the development set.

-There are two built-in logging functions: a logger printing results to the
-console in tabular format (which is the default), and one that also sends the
-results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
-using one of the built-in loggers listed here, you can also
+The built-in, default logger is the ConsoleLogger, which prints results to the
+console in tabular format. The
+[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
+a dependency of spaCy, enables other loggers: currently it provides one that sends
+results to a [Weights & Biases](https://www.wandb.com/) dashboard.
+
+Instead of using one of the built-in loggers, you can
 [implement your own](/usage/training#custom-logging).

 #### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
@ -463,63 +466,6 @@ start decreasing across epochs.

 </Accordion>

-#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"}
-
-> #### Installation
->
-> ```bash
-> $ pip install wandb
-> $ wandb login
-> ```
-
-Built-in logger that sends the results of each training step to the dashboard of
-the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
-& Biases should be installed, and you should be logged in. The logger will send
-the full config file to W&B, as well as various system information such as
-memory utilization, network traffic, disk IO, GPU statistics, etc. This will
-also include information such as your hostname and operating system, as well as
-the location of your Python executable.
-
-<Infobox variant="warning">
-
-Note that by default, the full (interpolated)
-[training config](/usage/training#config) is sent over to the W&B dashboard. If
-you prefer to **exclude certain information** such as path names, you can list
-those fields in "dot notation" in the `remove_config_values` parameter. These
-fields will then be removed from the config before uploading, but will otherwise
-remain in the config file stored on your local system.
-
-</Infobox>
-
-> #### Example config
->
-> ```ini
-> [training.logger]
-> @loggers = "spacy.WandbLogger.v3"
-> project_name = "monitor_spacy_training"
-> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
-> log_dataset_dir = "corpus"
-> model_log_interval = 1000
-> ```
-
-| Name                   | Description                                                                                                                                                                                                     |
-| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~                                                                           |
-| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                                                                                                        |
-| `model_log_interval`   | Steps to wait between logging model checkpoints to W&B dasboard (default: None). ~~Optional[int]~~                                                                                                              |
-| `log_dataset_dir`      | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~                                                                                                      |
-| `run_name`             | The name of the run. If you don't specify a run_name, the name will be created by wandb library. (default: None ). ~~Optional[str]~~                                                                            |
-| `entity`               | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ |
-
-<Project id="integrations/wandb">
-
-Get started with tracking your spaCy training runs in Weights & Biases using our
-project template. It trains on the IMDB Movie Review Dataset and includes a
-simple config with the built-in `WandbLogger`, as well as a custom example of
-creating variants of the config for a simple hyperparameter grid search and
-logging the results.
-
-</Project>

 ## Readers {#readers}

--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@ -1016,20 +1016,22 @@ commands:

 [Weights & Biases](https://www.wandb.com/) is a popular platform for experiment
 tracking. spaCy integrates with it out-of-the-box via the
-[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the
-`[training.logger]` block of your training [config](/usage/training#config). The
-results of each step are then logged in your project, together with the full
-**training config**. This means that _every_ hyperparameter, registered function
-name and argument will be tracked and you'll be able to see the impact it has on
-your results.
+[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which
+you can add as the `[training.logger]` block of your training
+[config](/usage/training#config). The results of each step are then logged in
+your project, together with the full **training config**. This means that
+_every_ hyperparameter, registered function name and argument will be tracked
+and you'll be able to see the impact it has on your results.

 > #### Example config
 >
 > ```ini
 > [training.logger]
-> @loggers = "spacy.WandbLogger.v2"
+> @loggers = "spacy.WandbLogger.v3"
 > project_name = "monitor_spacy_training"
 > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
+> log_dataset_dir = "corpus"
+> model_log_interval = 1000
 > ```

 ![Screenshot: Visualized training results](../images/wandb1.jpg)
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -944,8 +944,8 @@ During training, the results of each step are passed to a logger function. By
 default, these results are written to the console with the
 [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
 for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
-[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
-receives a **dictionary** with the following keys:
+[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each
+step, the logger function receives a **dictionary** with the following keys:

 | Key            | Value                                                                                                 |
 | -------------- | ----------------------------------------------------------------------------------------------------- |