Move WandB loggers into spacy-loggers (#9223)

* factor out the WandB logger into spacy-loggers

Signed-off-by: Elia Robyn Speer <gh@arborelia.net>

* depend on spacy-loggers so they are available

Signed-off-by: Elia Robyn Speer <gh@arborelia.net>

* remove docs of spacy.WandbLogger.v2 (moved to spacy-loggers)

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* Version number suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* update references to WandbLogger

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

* make order of deps more consistent

Signed-off-by: Elia Robyn Speer <elia@explosion.ai>

Co-authored-by: Elia Robyn Speer <elia@explosion.ai>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Elia Robyn Lake (Robyn Speer) 2021-09-29 05:12:50 -04:00 committed by GitHub
parent fe5f5d6ac6
commit 5b0b0ca809
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 21 additions and 235 deletions

View File

@ -1,5 +1,6 @@
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.10,<8.1.0

View File

@ -41,6 +41,7 @@ setup_requires =
install_requires =
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0

View File

@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
from .loggers import console_logger, wandb_logger # noqa: F401
from .loggers import console_logger # noqa: F401
from .callbacks import create_copy_from_base_model # noqa: F401

View File

@ -4,7 +4,6 @@ import tqdm
import sys
from ..util import registry
from .. import util
from ..errors import Errors
if TYPE_CHECKING:
@ -100,166 +99,3 @@ def console_logger(progress_bar: bool = False):
return setup_printer
@registry.loggers("spacy.WandbLogger.v2")
def wandb_logger(
    project_name: str,
    # NOTE: the list default is kept as-is on purpose; it is only iterated,
    # never mutated, and the signature is part of the registered interface.
    remove_config_values: List[str] = [],
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
):
    """Build a logger that prints to the console and reports to Weights & Biases.

    project_name (str): Passed to `wandb.init` as the project.
    remove_config_values (List[str]): Keys of the flattened (dot-notation)
        config that are deleted before the config is handed to `wandb.init`.
        A key that is not present is not skipped: the `del` fails.
    model_log_interval (Optional[int]): If set, the pipeline directory is
        logged as a "checkpoint" artifact whenever the step is a non-zero
        multiple of this value and the step info carries an "output_path".
    log_dataset_dir (Optional[str]): If set, this directory is logged once,
        at setup time, as a "dataset" artifact.
    RETURNS (Callable): The `setup_logger` function, which in turn returns
        the `(log_step, finalize)` pair.
    RAISES (ImportError): With message `Errors.E880` if `wandb`, or its
        `init`, `log` and `join` functions, cannot be imported.
    """
    try:
        import wandb

        # test that these are available
        from wandb import init, log, join  # noqa: F401
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(Errors.E880) from err

    # The console logger is wrapped so that console output is kept.
    console = console_logger(progress_bar=False)

    def setup_logger(
        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        # Flatten the interpolated config, drop the unwanted keys, and
        # rebuild the nested form before sending it to W&B.
        config = nlp.config.interpolate()
        config_dot = util.dict_to_dot(config)
        for field in remove_config_values:
            del config_dot[field]
        config = util.dot_to_dict(config_dot)
        run = wandb.init(project=project_name, config=config, reinit=True)
        console_log_step, console_finalize = console(nlp, stdout, stderr)

        def log_dir_artifact(
            path: str,
            name: str,
            artifact_type: str,
            metadata: Optional[Dict[str, Any]] = None,
            aliases: Optional[List[str]] = None,
        ):
            # Log the directory at `path` as a W&B artifact. Fresh containers
            # are created per call instead of sharing mutable defaults.
            if metadata is None:
                metadata = {}
            if aliases is None:
                aliases = []
            dataset_artifact = wandb.Artifact(
                name, type=artifact_type, metadata=metadata
            )
            dataset_artifact.add_dir(path, name=name)
            wandb.log_artifact(dataset_artifact, aliases=aliases)

        if log_dataset_dir:
            log_dir_artifact(
                path=log_dataset_dir, name="dataset", artifact_type="dataset"
            )

        def log_step(info: Optional[Dict[str, Any]]):
            # The console logger always runs, also when `info` is None.
            console_log_step(info)
            if info is None:
                return
            score = info["score"]
            other_scores = info["other_scores"]
            losses = info["losses"]
            wandb.log({"score": score})
            if losses:
                wandb.log({f"loss_{k}": v for k, v in losses.items()})
            if isinstance(other_scores, dict):
                wandb.log(other_scores)
            if model_log_interval and info.get("output_path"):
                if info["step"] % model_log_interval == 0 and info["step"] != 0:
                    # NOTE(review): when this step is not the best one, an
                    # empty string is still passed as the third alias.
                    log_dir_artifact(
                        path=info["output_path"],
                        name="pipeline_" + run.id,
                        artifact_type="checkpoint",
                        metadata=info,
                        aliases=[
                            f"epoch {info['epoch']} step {info['step']}",
                            "latest",
                            "best"
                            if info["score"] == max(info["checkpoints"])[0]
                            else "",
                        ],
                    )

        def finalize() -> None:
            console_finalize()
            wandb.join()

        return log_step, finalize

    return setup_logger
@registry.loggers("spacy.WandbLogger.v3")
def wandb_logger(
    project_name: str,
    # NOTE: the list default is kept as-is on purpose; it is only iterated,
    # never mutated, and the signature is part of the registered interface.
    remove_config_values: List[str] = [],
    model_log_interval: Optional[int] = None,
    log_dataset_dir: Optional[str] = None,
    entity: Optional[str] = None,
    run_name: Optional[str] = None,
):
    """Build a logger that prints to the console and reports to Weights & Biases.

    project_name (str): Passed to `wandb.init` as the project.
    remove_config_values (List[str]): Keys of the flattened (dot-notation)
        config that are deleted before the config is handed to `wandb.init`.
        A key that is not present is not skipped: the `del` fails.
    model_log_interval (Optional[int]): If set, the pipeline directory is
        logged as a "checkpoint" artifact whenever the step is a non-zero
        multiple of this value and the step info carries an "output_path".
    log_dataset_dir (Optional[str]): If set, this directory is logged once,
        at setup time, as a "dataset" artifact.
    entity (Optional[str]): Passed to `wandb.init` as the entity.
    run_name (Optional[str]): If truthy, assigned to `wandb.run.name` after
        the run has been initialised.
    RETURNS (Callable): The `setup_logger` function, which in turn returns
        the `(log_step, finalize)` pair.
    RAISES (ImportError): With message `Errors.E880` if `wandb`, or its
        `init`, `log` and `join` functions, cannot be imported.
    """
    try:
        import wandb

        # test that these are available
        from wandb import init, log, join  # noqa: F401
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(Errors.E880) from err

    # The console logger is wrapped so that console output is kept.
    console = console_logger(progress_bar=False)

    def setup_logger(
        nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
        # Flatten the interpolated config, drop the unwanted keys, and
        # rebuild the nested form before sending it to W&B.
        config = nlp.config.interpolate()
        config_dot = util.dict_to_dot(config)
        for field in remove_config_values:
            del config_dot[field]
        config = util.dot_to_dict(config_dot)
        run = wandb.init(
            project=project_name, config=config, entity=entity, reinit=True
        )
        if run_name:
            wandb.run.name = run_name
        console_log_step, console_finalize = console(nlp, stdout, stderr)

        def log_dir_artifact(
            path: str,
            name: str,
            artifact_type: str,
            metadata: Optional[Dict[str, Any]] = None,
            aliases: Optional[List[str]] = None,
        ):
            # Log the directory at `path` as a W&B artifact. Fresh containers
            # are created per call instead of sharing mutable defaults.
            if metadata is None:
                metadata = {}
            if aliases is None:
                aliases = []
            dataset_artifact = wandb.Artifact(
                name, type=artifact_type, metadata=metadata
            )
            dataset_artifact.add_dir(path, name=name)
            wandb.log_artifact(dataset_artifact, aliases=aliases)

        if log_dataset_dir:
            log_dir_artifact(
                path=log_dataset_dir, name="dataset", artifact_type="dataset"
            )

        def log_step(info: Optional[Dict[str, Any]]):
            # The console logger always runs, also when `info` is None.
            console_log_step(info)
            if info is None:
                return
            score = info["score"]
            other_scores = info["other_scores"]
            losses = info["losses"]
            wandb.log({"score": score})
            if losses:
                wandb.log({f"loss_{k}": v for k, v in losses.items()})
            if isinstance(other_scores, dict):
                wandb.log(other_scores)
            if model_log_interval and info.get("output_path"):
                if info["step"] % model_log_interval == 0 and info["step"] != 0:
                    # NOTE(review): when this step is not the best one, an
                    # empty string is still passed as the third alias.
                    log_dir_artifact(
                        path=info["output_path"],
                        name="pipeline_" + run.id,
                        artifact_type="checkpoint",
                        metadata=info,
                        aliases=[
                            f"epoch {info['epoch']} step {info['step']}",
                            "latest",
                            "best"
                            if info["score"] == max(info["checkpoints"])[0]
                            else "",
                        ],
                    )

        def finalize() -> None:
            console_finalize()
            wandb.join()

        return log_step, finalize

    return setup_logger

View File

@ -411,10 +411,13 @@ finished. To log each training step, a
[`spacy train`](/api/cli#train), including information such as the training loss
and the accuracy scores on the development set.
There are two built-in logging functions: a logger printing results to the
console in tabular format (which is the default), and one that also sends the
results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
using one of the built-in loggers listed here, you can also
The built-in, default logger is the ConsoleLogger, which prints results to the
console in tabular format. The
[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
a dependency of spaCy, enables other loggers: currently it provides one that sends
results to a [Weights & Biases](https://www.wandb.com/) dashboard.
Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
@ -463,63 +466,6 @@ start decreasing across epochs.
</Accordion>
#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"}
> #### Installation
>
> ```bash
> $ pip install wandb
> $ wandb login
> ```
Built-in logger that sends the results of each training step to the dashboard of
the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
& Biases should be installed, and you should be logged in. The logger will send
the full config file to W&B, as well as various system information such as
memory utilization, network traffic, disk IO, GPU statistics, etc. This will
also include information such as your hostname and operating system, as well as
the location of your Python executable.
<Infobox variant="warning">
Note that by default, the full (interpolated)
[training config](/usage/training#config) is sent over to the W&B dashboard. If
you prefer to **exclude certain information** such as path names, you can list
those fields in "dot notation" in the `remove_config_values` parameter. These
fields will then be removed from the config before uploading, but will otherwise
remain in the config file stored on your local system.
</Infobox>
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.WandbLogger.v3"
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
> log_dataset_dir = "corpus"
> model_log_interval = 1000
> ```
| Name | Description |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
| `remove_config_values` | A list of values to remove from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
| `model_log_interval` | Steps to wait between logging model checkpoints to W&B dashboard (default: None). ~~Optional[int]~~ |
| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ |
| `run_name` | The name of the run. If you don't specify a `run_name`, the name will be created by the wandb library (default: None). ~~Optional[str]~~ |
| `entity` | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ |
<Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It trains on the IMDB Movie Review Dataset and includes a
simple config with the built-in `WandbLogger`, as well as a custom example of
creating variants of the config for a simple hyperparameter grid search and
logging the results.
</Project>
## Readers {#readers}

View File

@ -1016,20 +1016,22 @@ commands:
[Weights & Biases](https://www.wandb.com/) is a popular platform for experiment
tracking. spaCy integrates with it out-of-the-box via the
[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the
`[training.logger]` block of your training [config](/usage/training#config). The
results of each step are then logged in your project, together with the full
**training config**. This means that _every_ hyperparameter, registered function
name and argument will be tracked and you'll be able to see the impact it has on
your results.
[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which
you can add as the `[training.logger]` block of your training
[config](/usage/training#config). The results of each step are then logged in
your project, together with the full **training config**. This means that
_every_ hyperparameter, registered function name and argument will be tracked
and you'll be able to see the impact it has on your results.
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.WandbLogger.v2"
> @loggers = "spacy.WandbLogger.v3"
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
> log_dataset_dir = "corpus"
> model_log_interval = 1000
> ```
![Screenshot: Visualized training results](../images/wandb1.jpg)

View File

@ -944,8 +944,8 @@ During training, the results of each step are passed to a logger function. By
default, these results are written to the console with the
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
receives a **dictionary** with the following keys:
[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each
step, the logger function receives a **dictionary** with the following keys:
| Key | Value |
| -------------- | ----------------------------------------------------------------------------------------------------- |