Add debug diff command in spaCy CLI (#10502)

* Add initial design for diff command

For now, the diffing process looks like this:
- The default config is created based on some values in the user
config (e.g. which pipeline components were used, the lang, etc.)
- The user must manually specify whether the config was optimized for
accuracy or efficiency and whether pretraining was involved; see the
invocation sketched below.
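
A sketch of the eventual invocation (using the final `diff-config`
command name; paths are hypothetical):

```cli
$ python -m spacy debug diff-config ./config.cfg --optimize accuracy --pretraining
```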

* Make diff command structure similar to siblings

* Include gpu as a user option for CLI

* Make variables more explicit

* Fix type declaration for optimize enum

* Improve docstrings for diff CLI

* Add debug-diff to website API docs

* Switch position of configs so that user config is modded

* Add markdown flag for debug diff

This commit adds a `--markdown` (`-md`) flag that makes the output easier
to copy-paste into GitHub issues. Please note that this commit depends
on an unreleased version of wasabi (for the time being).

For posterity, the related PR is found here: https://github.com/ines/wasabi/pull/20

* Bump version of wasabi to 0.9.1

So that we can use the add_symbols parameter.
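
A minimal sketch of what the new parameter enables (the strings here are
illustrative, not from the codebase):

```python
from wasabi import diff_strings

old = "train = null"
new = 'train = "./data/train.spacy"'

# add_symbols=True (new in wasabi 0.9.1) prefixes changed lines with
# "+"/"-", so the output renders as a proper diff inside a ```diff
# code block on GitHub.
print(diff_strings(old, new, add_symbols=True))
```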

* Apply suggestions from code review

Co-authored-by: Ines Montani <ines@ines.io>

* Update docs based on code review suggestions

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Change command name from diff -> diff-config

* Clarify when options are relevant or not

* Rerun prettier on cli.md

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Lj Miranda 2022-04-07 16:48:45 +08:00 committed by GitHub
parent b91255a454
commit 02dafa3a84
5 changed files with 321 additions and 2 deletions

requirements.txt
@@ -7,7 +7,7 @@ thinc>=8.0.14,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
-wasabi>=0.8.1,<1.1.0
+wasabi>=0.9.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0

setup.cfg
@@ -48,7 +48,7 @@ install_requires =
preshed>=3.0.2,<3.1.0
thinc>=8.0.14,<8.1.0
blis>=0.4.0,<0.8.0
-wasabi>=0.8.1,<1.1.0
+wasabi>=0.9.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0

spacy/cli/__init__.py
@@ -14,6 +14,7 @@ from .pretrain import pretrain  # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401

spacy/cli/debug_diff.py (new file, 89 lines)
@@ -0,0 +1,89 @@
from typing import Optional
import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config

from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from .init_config import init_config, Optimizations


@debug_cli.command(
    "diff-config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
    # fmt: off
    ctx: typer.Context,
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
    # fmt: on
):
    """Show a diff of a config file with respect to spaCy's defaults or
    another config file. If additional settings were used in the creation
    of the config file, then you must supply these as extra parameters to
    the command when comparing to the default settings. The generated diff
    can also be used when posting to the discussion forum to provide more
    information for the maintainers.

    The `optimize`, `gpu`, and `pretraining` options are only relevant when
    comparing against the default configuration (or specifically when
    `compare_to` is None).

    DOCS: https://spacy.io/api/cli#debug-diff
    """
    debug_diff(
        config_path=config_path,
        compare_to=compare_to,
        gpu=gpu,
        optimize=optimize,
        pretraining=pretraining,
        markdown=markdown,
    )


def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
):
    msg = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if compare_to:
            # Diff against another user-supplied config
            other_config = load_config(compare_to)
        else:
            # Recreate a default config based on the user's config, reusing
            # the lang/pipeline and applying the user-supplied settings
            lang = user_config["nlp"]["lang"]
            pipeline = list(user_config["nlp"]["pipeline"])
            msg.info(f"Found user-defined language: '{lang}'")
            msg.info(f"Found user-defined pipelines: {pipeline}")
            other_config = init_config(
                lang=lang,
                pipeline=pipeline,
                optimize=optimize.value,
                gpu=gpu,
                pretraining=pretraining,
                silent=True,
            )

    user = user_config.to_str()
    other = other_config.to_str()

    if user == other:
        msg.warn("No diff to show: configs are identical")
    else:
        # Diff in "other -> user" order so the user's values show as additions
        diff_text = diff_strings(other, user, add_symbols=markdown)
        if markdown:
            # Wrap the diff in a fenced ```diff block for GitHub issues
            md = MarkdownRenderer()
            md.add(md.code_block(diff_text, "diff"))
            print(md.text)
        else:
            print(diff_text)
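
For reference, a minimal sketch of calling the new command programmatically
rather than via `spacy debug diff-config` (paths are hypothetical):

```python
from pathlib import Path

from spacy.cli.debug_diff import debug_diff
from spacy.cli.init_config import Optimizations

# Compare a local config against a freshly generated default config
# (compare_to=None), mirroring what the CLI entry point does.
debug_diff(
    config_path=Path("config.cfg"),
    compare_to=None,
    gpu=False,
    optimize=Optimizations.efficiency,
    pretraining=False,
    markdown=False,
)
```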

website/docs/api/cli.md
@@ -626,6 +626,235 @@ will not be available.
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |

### debug diff-config {#debug-diff tag="command"}

Show a diff of a config file with respect to spaCy's defaults or another config
file. If additional settings were used in the creation of the config file, then
you must supply these as extra parameters to the command when comparing to the
default settings. The generated diff can also be used when posting to the
discussion forum to provide more information for the maintainers.

```cli
$ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown]
```

> #### Example
>
> ```cli
> $ python -m spacy debug diff-config ./config.cfg
> ```
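>
> A hypothetical variation that diffs against a second config and emits
> GitHub-ready Markdown:
>
> ```cli
> $ python -m spacy debug diff-config ./config.cfg --compare-to ./other.cfg --markdown
> ```
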
<Accordion title="Example output" spaced>

```
Found user-defined language: 'en'
Found user-defined pipelines: ['tok2vec', 'tagger', 'parser', 'ner']

[paths]
+ train = "./data/train.spacy"
+ dev = "./data/dev.spacy"
- train = null
- dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
+ seed = 42
- seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","tagger","parser","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
- hidden_width = 64
+ hidden_width = 36
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tagger]
factory = "tagger"
neg_prefix = "!"
overwrite = false
scorer = {"@scorers":"spacy.tagger_scorer.v1"}

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
tag_acc = 0.33
dep_uas = 0.17
dep_las = 0.17
dep_las_per_type = null
sents_p = null
sents_r = null
sents_f = 0.0
ents_f = 0.33
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
```

</Accordion>

| Name | Description |
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Union[Path, str] \(positional)~~ |
| `compare_to` | Path to another config file to diff against, or `None` to compare against default settings. ~~Optional[Union[Path, str]] \(option)~~ |
| `optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether the config was optimized for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). Only relevant when comparing against a default config. Defaults to `"efficiency"`. ~~str (option)~~ |
| `gpu`, `-G` | Whether the config was made to run on a GPU. Only relevant when comparing against a default config. ~~bool (flag)~~ |
| `pretraining`, `--pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Only relevant when comparing against a default config. Defaults to `False`. ~~bool (flag)~~ |
| `markdown`, `-md` | Generate Markdown for GitHub issues. Defaults to `False`. ~~bool (flag)~~ |
| **PRINTS** | Diff between the two config files. |
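
With `--markdown`, the diff is wrapped in a fenced `diff` code block so the
`+`/`-` symbols render as a colored diff on GitHub; abridged from the example
output above:

```diff
[paths]
+ train = "./data/train.spacy"
+ dev = "./data/dev.spacy"
- train = null
- dev = null
```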

### debug profile {#debug-profile tag="command"}

Profile which functions take the most time in a spaCy pipeline. Input should be