diff --git a/requirements.txt b/requirements.txt index 12fdf650f..6f9addbe9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ # Our libraries spacy-legacy>=3.0.8,<3.1.0 +spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.10,<8.1.0 diff --git a/setup.cfg b/setup.cfg index fe484f92e..45fa48ce5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,7 @@ setup_requires = install_requires = # Our libraries spacy-legacy>=3.0.8,<3.1.0 + spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 055f30f42..22f1e64b1 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -7,5 +7,5 @@ from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F40 from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401 from .gold_io import docs_to_json, read_json_file # noqa: F401 from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401 -from .loggers import console_logger, wandb_logger # noqa: F401 +from .loggers import console_logger # noqa: F401 from .callbacks import create_copy_from_base_model # noqa: F401 diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 137e89e56..d80c77b6a 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -4,7 +4,6 @@ import tqdm import sys from ..util import registry -from .. import util from ..errors import Errors if TYPE_CHECKING: @@ -100,166 +99,3 @@ def console_logger(progress_bar: bool = False): return setup_printer - -@registry.loggers("spacy.WandbLogger.v2") -def wandb_logger( - project_name: str, - remove_config_values: List[str] = [], - model_log_interval: Optional[int] = None, - log_dataset_dir: Optional[str] = None, -): - try: - import wandb - - # test that these are available - from wandb import init, log, join # noqa: F401 - except ImportError: - raise ImportError(Errors.E880) - - console = console_logger(progress_bar=False) - - def setup_logger( - nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: - config = nlp.config.interpolate() - config_dot = util.dict_to_dot(config) - for field in remove_config_values: - del config_dot[field] - config = util.dot_to_dict(config_dot) - run = wandb.init(project=project_name, config=config, reinit=True) - console_log_step, console_finalize = console(nlp, stdout, stderr) - - def log_dir_artifact( - path: str, - name: str, - type: str, - metadata: Optional[Dict[str, Any]] = {}, - aliases: Optional[List[str]] = [], - ): - dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata) - dataset_artifact.add_dir(path, name=name) - wandb.log_artifact(dataset_artifact, aliases=aliases) - - if log_dataset_dir: - log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") - - def log_step(info: Optional[Dict[str, Any]]): - console_log_step(info) - if info is not None: - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) - if model_log_interval and info.get("output_path"): - if info["step"] % model_log_interval == 0 and info["step"] != 0: - log_dir_artifact( - path=info["output_path"], - name="pipeline_" + run.id, - type="checkpoint", - metadata=info, - aliases=[ - f"epoch {info['epoch']} step {info['step']}", - "latest", - "best" - if info["score"] == max(info["checkpoints"])[0] - else "", - ], - ) - - def finalize() -> None: - console_finalize() - wandb.join() - - return log_step, finalize - - return setup_logger - - -@registry.loggers("spacy.WandbLogger.v3") -def wandb_logger( - project_name: str, - remove_config_values: List[str] = [], - model_log_interval: Optional[int] = None, - log_dataset_dir: Optional[str] = None, - entity: Optional[str] = None, - run_name: Optional[str] = None, -): - try: - import wandb - - # test that these are available - from wandb import init, log, join # noqa: F401 - except ImportError: - raise ImportError(Errors.E880) - - console = console_logger(progress_bar=False) - - def setup_logger( - nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr - ) -> Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]: - config = nlp.config.interpolate() - config_dot = util.dict_to_dot(config) - for field in remove_config_values: - del config_dot[field] - config = util.dot_to_dict(config_dot) - run = wandb.init( - project=project_name, config=config, entity=entity, reinit=True - ) - - if run_name: - wandb.run.name = run_name - - console_log_step, console_finalize = console(nlp, stdout, stderr) - - def log_dir_artifact( - path: str, - name: str, - type: str, - metadata: Optional[Dict[str, Any]] = {}, - aliases: Optional[List[str]] = [], - ): - dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata) - dataset_artifact.add_dir(path, name=name) - wandb.log_artifact(dataset_artifact, aliases=aliases) - - if log_dataset_dir: - log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset") - - def log_step(info: Optional[Dict[str, Any]]): - console_log_step(info) - if info is not None: - score = info["score"] - other_scores = info["other_scores"] - losses = info["losses"] - wandb.log({"score": score}) - if losses: - wandb.log({f"loss_{k}": v for k, v in losses.items()}) - if isinstance(other_scores, dict): - wandb.log(other_scores) - if model_log_interval and info.get("output_path"): - if info["step"] % model_log_interval == 0 and info["step"] != 0: - log_dir_artifact( - path=info["output_path"], - name="pipeline_" + run.id, - type="checkpoint", - metadata=info, - aliases=[ - f"epoch {info['epoch']} step {info['step']}", - "latest", - "best" - if info["score"] == max(info["checkpoints"])[0] - else "", - ], - ) - - def finalize() -> None: - console_finalize() - wandb.join() - - return log_step, finalize - - return setup_logger diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index f9490803f..48c16e559 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -411,10 +411,13 @@ finished. To log each training step, a [`spacy train`](/api/cli#train), including information such as the training loss and the accuracy scores on the development set. -There are two built-in logging functions: a logger printing results to the -console in tabular format (which is the default), and one that also sends the -results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of -using one of the built-in loggers listed here, you can also +The built-in, default logger is the ConsoleLogger, which prints results to the +console in tabular format. The +[spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as +a dependency of spaCy, enables other loggers: currently it provides one that sends +results to a [Weights & Biases](https://www.wandb.com/) dashboard. + +Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). #### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} @@ -463,63 +466,6 @@ start decreasing across epochs. -#### spacy.WandbLogger.v3 {#WandbLogger tag="registered function"} - -> #### Installation -> -> ```bash -> $ pip install wandb -> $ wandb login -> ``` - -Built-in logger that sends the results of each training step to the dashboard of -the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights -& Biases should be installed, and you should be logged in. The logger will send -the full config file to W&B, as well as various system information such as -memory utilization, network traffic, disk IO, GPU statistics, etc. This will -also include information such as your hostname and operating system, as well as -the location of your Python executable. - - - -Note that by default, the full (interpolated) -[training config](/usage/training#config) is sent over to the W&B dashboard. If -you prefer to **exclude certain information** such as path names, you can list -those fields in "dot notation" in the `remove_config_values` parameter. These -fields will then be removed from the config before uploading, but will otherwise -remain in the config file stored on your local system. - - - -> #### Example config -> -> ```ini -> [training.logger] -> @loggers = "spacy.WandbLogger.v3" -> project_name = "monitor_spacy_training" -> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] -> log_dataset_dir = "corpus" -> model_log_interval = 1000 -> ``` - -| Name | Description | -| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | -| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | -| `model_log_interval` | Steps to wait between logging model checkpoints to W&B dasboard (default: None). ~~Optional[int]~~ | -| `log_dataset_dir` | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~ | -| `run_name` | The name of the run. If you don't specify a run_name, the name will be created by wandb library. (default: None ). ~~Optional[str]~~ | -| `entity` | An entity is a username or team name where you're sending runs. If you don't specify an entity, the run will be sent to your default entity, which is usually your username. (default: None). ~~Optional[str]~~ | - - - -Get started with tracking your spaCy training runs in Weights & Biases using our -project template. It trains on the IMDB Movie Review Dataset and includes a -simple config with the built-in `WandbLogger`, as well as a custom example of -creating variants of the config for a simple hyperparameter grid search and -logging the results. - - ## Readers {#readers} diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 6f6cef7c8..e0e787a1d 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -1016,20 +1016,22 @@ commands: [Weights & Biases](https://www.wandb.com/) is a popular platform for experiment tracking. spaCy integrates with it out-of-the-box via the -[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the -`[training.logger]` block of your training [config](/usage/training#config). The -results of each step are then logged in your project, together with the full -**training config**. This means that _every_ hyperparameter, registered function -name and argument will be tracked and you'll be able to see the impact it has on -your results. +[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger), which +you can add as the `[training.logger]` block of your training +[config](/usage/training#config). The results of each step are then logged in +your project, together with the full **training config**. This means that +_every_ hyperparameter, registered function name and argument will be tracked +and you'll be able to see the impact it has on your results. > #### Example config > > ```ini > [training.logger] -> @loggers = "spacy.WandbLogger.v2" +> @loggers = "spacy.WandbLogger.v3" > project_name = "monitor_spacy_training" > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"] +> log_dataset_dir = "corpus" +> model_log_interval = 1000 > ``` ![Screenshot: Visualized training results](../images/wandb1.jpg) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 94fdad209..c28b43ea6 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -944,8 +944,8 @@ During training, the results of each step are passed to a logger function. By default, these results are written to the console with the [`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support for writing the log files to [Weights & Biases](https://www.wandb.com/) with the -[`WandbLogger`](/api/top-level#WandbLogger). On each step, the logger function -receives a **dictionary** with the following keys: +[`WandbLogger`](https://github.com/explosion/spacy-loggers#wandblogger). On each +step, the logger function receives a **dictionary** with the following keys: | Key | Value | | -------------- | ----------------------------------------------------------------------------------------------------- |