mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Move WandB loggers into spacy-loggers (#9223)
* factor out the WandB logger into spacy-loggers Signed-off-by: Elia Robyn Speer <gh@arborelia.net> * depend on spacy-loggers so they are available Signed-off-by: Elia Robyn Speer <gh@arborelia.net> * remove docs of spacy.WandbLogger.v2 (moved to spacy-loggers) Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * Version number suggestions from code review Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * update references to WandbLogger Signed-off-by: Elia Robyn Speer <elia@explosion.ai> * make order of deps more consistent Signed-off-by: Elia Robyn Speer <elia@explosion.ai> Co-authored-by: Elia Robyn Speer <elia@explosion.ai> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
fe5f5d6ac6
commit
5b0b0ca809
|
@ -1,5 +1,6 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.8,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.10,<8.1.0
|
thinc>=8.0.10,<8.1.0
|
||||||
|
|
|
@ -41,6 +41,7 @@ setup_requires =
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.8,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
|
|
|
@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40
|
||||||
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
||||||
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
||||||
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
||||||
from .loggers import console_logger, wandb_logger # noqa: F401
|
from .loggers import console_logger # noqa: F401
|
||||||
from .callbacks import create_copy_from_base_model # noqa: F401
|
from .callbacks import create_copy_from_base_model # noqa: F401
|
||||||
|
|
|
@ -4,7 +4,6 @@ import tqdm
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -100,166 +99,3 @@ def console_logger(progress_bar: bool = False):
|
||||||
|
|
||||||
return setup_printer
|
return setup_printer
|
||||||
|
|
||||||
|
|
||||||
@registry.loggers("spacy.WandbLogger.v2")
|
|
||||||
def wandb_logger(
|
|
||||||
project_name: str,
|
|
||||||
remove_config_values: List[str] = [],
|
|
||||||
model_log_interval: Optional[int] = None,
|
|
||||||
log_dataset_dir: Optional[str] = None,
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
import wandb
|
|
||||||
|
|
||||||
# test that these are available
|
|
||||||
from wandb import init, log, join # noqa: F401
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(Errors.E880)
|
|
||||||
|
|
||||||
console = console_logger(progress_bar=False)
|
|
||||||
|
|
||||||
def setup_logger(
|
|
||||||
nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
|
|
||||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
|
|
||||||
config = nlp.config.interpolate()
|
|
||||||
config_dot = util.dict_to_dot(config)
|
|
||||||
for field in remove_config_values:
|
|
||||||
del config_dot[field]
|
|
||||||
config = util.dot_to_dict(config_dot)
|
|
||||||
run = wandb.init(project=project_name, config=config, reinit=True)
|
|
||||||
console_log_step, console_finalize = console(nlp, stdout, stderr)
|
|
||||||
|
|
||||||
def log_dir_artifact(
|
|
||||||
path: str,
|
|
||||||
name: str,
|
|
||||||
type: str,
|
|
||||||
metadata: Optional[Dict[str, Any]] = {},
|
|
||||||
aliases: Optional[List[str]] = [],
|
|
||||||
):
|
|
||||||
dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata)
|
|
||||||
dataset_artifact.add_dir(path, name=name)
|
|
||||||
wandb.log_artifact(dataset_artifact, aliases=aliases)
|
|
||||||
|
|
||||||
if log_dataset_dir:
|
|
||||||
log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset")
|
|
||||||
|
|
||||||
def log_step(info: Optional[Dict[str, Any]]):
|
|
||||||
console_log_step(info)
|
|
||||||
if info is not None:
|
|
||||||
score = info["score"]
|
|
||||||
other_scores = info["other_scores"]
|
|
||||||
losses = info["losses"]
|
|
||||||
wandb.log({"score": score})
|
|
||||||
if losses:
|
|
||||||
wandb.log({f"loss_{k}": v for k, v in losses.items()})
|
|
||||||
if isinstance(other_scores, dict):
|
|
||||||
wandb.log(other_scores)
|
|
||||||
if model_log_interval and info.get("output_path"):
|
|
||||||
if info["step"] % model_log_interval == 0 and info["step"] != 0:
|
|
||||||
log_dir_artifact(
|
|
||||||
path=info["output_path"],
|
|
||||||
name="pipeline_" + run.id,
|
|
||||||
type="checkpoint",
|
|
||||||
metadata=info,
|
|
||||||
aliases=[
|
|
||||||
f"epoch {info['epoch']} step {info['step']}",
|
|
||||||
"latest",
|
|
||||||
"best"
|
|
||||||
if info["score"] == max(info["checkpoints"])[0]
|
|
||||||
else "",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
def finalize() -> None:
|
|
||||||
console_finalize()
|
|
||||||
wandb.join()
|
|
||||||
|
|
||||||
return log_step, finalize
|
|
||||||
|
|
||||||
return setup_logger
|
|
||||||
|
|
||||||
|
|
||||||
@registry.loggers("spacy.WandbLogger.v3")
|
|
||||||
def wandb_logger(
|
|
||||||
project_name: str,
|
|
||||||
remove_config_values: List[str] = [],
|
|
||||||
model_log_interval: Optional[int] = None,
|
|
||||||
log_dataset_dir: Optional[str] = None,
|
|
||||||
entity: Optional[str] = None,
|
|
||||||
run_name: Optional[str] = None,
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
import wandb
|
|
||||||
|
|
||||||
# test that these are available
|
|
||||||
from wandb import init, log, join # noqa: F401
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(Errors.E880)
|
|
||||||
|
|
||||||
console = console_logger(progress_bar=False)
|
|
||||||
|
|
||||||
def setup_logger(
|
|
||||||
nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr
|
|
||||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]:
|
|
||||||
config = nlp.config.interpolate()
|
|
||||||
config_dot = util.dict_to_dot(config)
|
|
||||||
for field in remove_config_values:
|
|
||||||
del config_dot[field]
|
|
||||||
config = util.dot_to_dict(config_dot)
|
|
||||||
run = wandb.init(
|
|
||||||
project=project_name, config=config, entity=entity, reinit=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if run_name:
|
|
||||||
wandb.run.name = run_name
|
|
||||||
|
|
||||||
console_log_step, console_finalize = console(nlp, stdout, stderr)
|
|
||||||
|
|
||||||
def log_dir_artifact(
|
|
||||||
path: str,
|
|
||||||
name: str,
|
|
||||||
type: str,
|
|
||||||
metadata: Optional[Dict[str, Any]] = {},
|
|
||||||
aliases: Optional[List[str]] = [],
|
|
||||||
):
|
|
||||||
dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata)
|
|
||||||
dataset_artifact.add_dir(path, name=name)
|
|
||||||
wandb.log_artifact(dataset_artifact, aliases=aliases)
|
|
||||||
|
|
||||||
if log_dataset_dir:
|
|
||||||
log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset")
|
|
||||||
|
|
||||||
def log_step(info: Optional[Dict[str, Any]]):
|
|
||||||
console_log_step(info)
|
|
||||||
if info is not None:
|
|
||||||
score = info["score"]
|
|
||||||
other_scores = info["other_scores"]
|
|
||||||
losses = info["losses"]
|
|
||||||
wandb.log({"score": score})
|
|
||||||
if losses:
|
|
||||||
wandb.log({f"loss_{k}": v for k, v in losses.items()})
|
|
||||||
if isinstance(other_scores, dict):
|
|
||||||
wandb.log(other_scores)
|
|
||||||
if model_log_interval and info.get("output_path"):
|
|
||||||
if info["step"] % model_log_interval == 0 and info["step"] != 0:
|
|
||||||
log_dir_artifact(
|
|
||||||
path=info["output_path"],
|
|
||||||
name="pipeline_" + run.id,
|
|
||||||
type="checkpoint",
|
|
||||||
metadata=info,
|
|
||||||
aliases=[
|
|
||||||
f"epoch {info['epoch']} step {info['step']}",
|
|
||||||
"latest",
|
|
||||||
"best"
|
|
||||||
if info["score"] == max(info["checkpoints"])[0]
|
|
||||||
else "",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
def finalize() -> None:
|
|
||||||
console_finalize()
|
|
||||||
wandb.join()
|
|
||||||
|
|
||||||
return log_step, finalize
|
|
||||||
|
|
||||||
return setup_logger
|
|
||||||
|
|
|
@ -411,10 +411,13 @@ finished. To log each training step, a
|
||||||
[`spacy train`](/api/cli#train), including information such as the training loss
|
[`spacy train`](/api/cli#train), including information such as the training loss
|
||||||
and the accuracy scores on the development set.
|
and the accuracy scores on the development set.
|
||||||
|
|
||||||
There are two built-in logging functions: a logger printing results to the
|
The built-in, default logger is the ConsoleLogger, which prints results to the
|
||||||
console in tabular format (which is the default), and one that also sends the
|
console in tabular format. The
|
||||||
results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
|
[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as
|
||||||
using one of the built-in loggers listed here, you can also
|
a dependency of spaCy, enables other loggers: currently it provides one that sends
|
||||||
|
results to a [Weights & Biases](https://www.wandb.com/) dashboard.
|
||||||
|
|
||||||
|
Instead of using one of the built-in loggers, you can
|
||||||
[implement your own](/usage/training#custom-logging).
|
[implement your own](/usage/training#custom-logging).
|
||||||
|
|
||||||
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
|
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
|
||||||
|
@ -463,63 +466,6 @@ start decreasing across epochs.
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"}
|
|
||||||
|
|
||||||
> #### Installation
|
|
||||||
>
|
|
||||||
> ```bash
|
|
||||||
> $ pip install wandb
|
|
||||||
> $ wandb login
|
|
||||||
> ```
|
|
||||||
|
|
||||||
Built-in logger that sends the results of each training step to the dashboard of
|
|
||||||
the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
|
|
||||||
& Biases should be installed, and you should be logged in. The logger will send
|
|
||||||
the full config file to W&B, as well as various system information such as
|
|
||||||
memory utilization, network traffic, disk IO, GPU statistics, etc. This will
|
|
||||||
also include information such as your hostname and operating system, as well as
|
|
||||||
the location of your Python executable.
|
|
||||||
|
|
||||||
<Infobox variant="warning">
|
|
||||||
|
|
||||||
Note that by default, the full (interpolated)
|
|
||||||
[training config](/usage/training#config) is sent over to the W&B dashboard. If
|
|
||||||
you prefer to **exclude certain information** such as path names, you can list
|
|
||||||
those fields in "dot notation" in the `remove_config_values` parameter. These
|
|
||||||
fields will then be removed from the config before uploading, but will otherwise
|
|
||||||
remain in the config file stored on your local system.
|
|
||||||
|
|
||||||
</Infobox>
|
|
||||||
|
|
||||||
> #### Example config
|
|
||||||
>
|
|
||||||
> ```ini
|
|
||||||
> [training.logger]
|
|
||||||
> @loggers = "spacy.WandbLogger.v3"
|
|
||||||
> project_name = "monitor_spacy_training"
|
|
||||||
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
|
|
||||||
> log_dataset_dir = "corpus"
|
|
||||||
> model_log_interval = 1000
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
|
|
||||||
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                                                                                                     |
|
|
||||||
| `model_log_interval`   | Steps to wait between logging model checkpoints to W&B dashboard (default: None). ~~Optional[int]~~                                                                                                           |
|
|
||||||
| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ |
|
|
||||||
| `run_name`             | The name of the run. If you don't specify a run_name, the name will be created by the wandb library (default: None). ~~Optional[str]~~                                                                        |
|
|
||||||
| `entity` | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ |
|
|
||||||
|
|
||||||
<Project id="integrations/wandb">
|
|
||||||
|
|
||||||
Get started with tracking your spaCy training runs in Weights & Biases using our
|
|
||||||
project template. It trains on the IMDB Movie Review Dataset and includes a
|
|
||||||
simple config with the built-in `WandbLogger`, as well as a custom example of
|
|
||||||
creating variants of the config for a simple hyperparameter grid search and
|
|
||||||
logging the results.
|
|
||||||
|
|
||||||
</Project>
|
|
||||||
|
|
||||||
## Readers {#readers}
|
## Readers {#readers}
|
||||||
|
|
||||||
|
|
|
@ -1016,20 +1016,22 @@ commands:
|
||||||
|
|
||||||
[Weights & Biases](https://www.wandb.com/) is a popular platform for experiment
|
[Weights & Biases](https://www.wandb.com/) is a popular platform for experiment
|
||||||
tracking. spaCy integrates with it out-of-the-box via the
|
tracking. spaCy integrates with it out-of-the-box via the
|
||||||
[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the
|
[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which
|
||||||
`[training.logger]` block of your training [config](/usage/training#config). The
|
you can add as the `[training.logger]` block of your training
|
||||||
results of each step are then logged in your project, together with the full
|
[config](/usage/training#config). The results of each step are then logged in
|
||||||
**training config**. This means that _every_ hyperparameter, registered function
|
your project, together with the full **training config**. This means that
|
||||||
name and argument will be tracked and you'll be able to see the impact it has on
|
_every_ hyperparameter, registered function name and argument will be tracked
|
||||||
your results.
|
and you'll be able to see the impact it has on your results.
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.logger]
|
> [training.logger]
|
||||||
> @loggers = "spacy.WandbLogger.v2"
|
> @loggers = "spacy.WandbLogger.v3"
|
||||||
> project_name = "monitor_spacy_training"
|
> project_name = "monitor_spacy_training"
|
||||||
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
|
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
|
||||||
|
> log_dataset_dir = "corpus"
|
||||||
|
> model_log_interval = 1000
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
![Screenshot: Visualized training results](../images/wandb1.jpg)
|
![Screenshot: Visualized training results](../images/wandb1.jpg)
|
||||||
|
|
|
@ -944,8 +944,8 @@ During training, the results of each step are passed to a logger function. By
|
||||||
default, these results are written to the console with the
|
default, these results are written to the console with the
|
||||||
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
|
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
|
||||||
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
|
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
|
||||||
[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function
|
[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each
|
||||||
receives a **dictionary** with the following keys:
|
step, the logger function receives a **dictionary** with the following keys:
|
||||||
|
|
||||||
| Key | Value |
|
| Key | Value |
|
||||||
| -------------- | ----------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------------------------------------------------------------------------------------------------- |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user