Add debug diff command in spaCy CLI (#10502)

* Add initial design for diff command For now, the diffing process looks like this: - The default config is created based from some values in the user config (e.g. which pipeline components were used, the lang, etc.) - The user must supply manually if it was optimized for acc/efficiency and if pretraining was involved. * Make diff command structure similar to siblings * Include gpu as a user option for CLI * Make variables more explicit * Fix type declaration for optimize enum * Improve docstrings for diff CLI * Add debug-diff to website API docs * Switch position of configs so that user config is modded * Add markdown flag for debug diff This commit adds a --markdown (--md) flag that allows easier copy-pasting to Github issues. Please note that this commit is dependent on an unreleased version of wasabi (for the time being). For posterity, the related PR is found here: https://github.com/ines/wasabi/pull/20 * Bump version of wasabi to 0.9.1 So that we can use the add_symbols parameter. * Apply suggestions from code review Co-authored-by: Ines Montani <ines@ines.io> * Update docs based on code review suggestions Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Change command name from diff -> diff-config * Clarify when options are relevant or not * Rerun prettier on cli.md Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2025-11-25 12:25:51 +03:00 · 2022-04-07 16:48:45 +08:00 · 2022-04-07 16:48:45 +08:00 · 02dafa3a84
commit 02dafa3a84
parent b91255a454
5 changed files with 321 additions and 2 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -7,7 +7,7 @@ thinc>=8.0.14,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.8.1,<1.1.0
+wasabi>=0.9.1,<1.1.0
 srsly>=2.4.1,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
--- a/setup.cfg
+++ b/setup.cfg
@ -48,7 +48,7 @@ install_requires =
    preshed>=3.0.2,<3.1.0
    thinc>=8.0.14,<8.1.0
    blis>=0.4.0,<0.8.0
-    wasabi>=0.8.1,<1.1.0
+    wasabi>=0.9.1,<1.1.0
    srsly>=2.4.1,<3.0.0
    catalogue>=2.0.6,<2.1.0
    typer>=0.3.0,<0.5.0
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -14,6 +14,7 @@ from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
--- a/spacy/cli/debug_diff.py
+++ b/spacy/cli/debug_diff.py
@ -0,0 +1,89 @@
 from typing import Optional
 import typer
 from wasabi import Printer, diff_strings, MarkdownRenderer
 from pathlib import Path
 from thinc.api import Config
 from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
 from .init_config import init_config, Optimizations
# Register this function as the `diff-config` command on `debug_cli`. The
# context settings make the command accept extra/unknown arguments instead
# of rejecting them.
@debug_cli.command(
    "diff-config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
 def debug_diff_cli(
    # fmt: off
    # NOTE(review): `ctx` is not referenced in the body below, so any extra
    # arguments let through by the context settings appear to be ignored here
    # -- confirm this is intended.
    ctx: typer.Context,
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
    # The next three options are only forwarded to `init_config` by
    # `debug_diff` when no `compare_to` file is given.
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
    # fmt: on
 ):
    """Show a diff of a config file with respect to spaCy's defaults or another config file. If
    additional settings were used in the creation of the config file, then you
    must supply these as extra parameters to the command when comparing to the default settings. The generated diff
    can also be used when posting to the discussion forum to provide more
    information for the maintainers.
    The `optimize`, `gpu`, and `pretraining` options are only relevant when
    comparing against the default configuration (or specifically when `compare_to` is None).
    DOCS: https://spacy.io/api/cli#debug-diff
    """
    # Forward the parsed options unchanged to `debug_diff`, which does the
    # actual loading, comparison and printing.
    debug_diff(
        config_path=config_path,
        compare_to=compare_to,
        gpu=gpu,
        optimize=optimize,
        pretraining=pretraining,
        markdown=markdown,
    )
 def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
 ):
    """Print a diff between a user config and a reference config.

    config_path (Path): Config file whose settings are being inspected.
    compare_to (Optional[Path]): Config file to diff against. If falsy, a
        reference config is generated with `init_config` from the language
        and pipeline found in the user config.
    gpu (bool): Forwarded to `init_config` when the reference is generated.
    optimize (Optimizations): Its `.value` is forwarded to `init_config`
        when the reference is generated.
    pretraining (bool): Forwarded to `init_config` when the reference is
        generated.
    markdown (bool): If set, diff symbols are requested from `diff_strings`
        and the result is printed inside a Markdown `diff` code block.
    """
    printer = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if not compare_to:
            # No explicit reference file: rebuild one from the language and
            # pipeline components declared in the user's own config.
            lang = user_config["nlp"]["lang"]
            pipeline = list(user_config["nlp"]["pipeline"])
            printer.info(f"Found user-defined language: '{lang}'")
            printer.info(f"Found user-defined pipelines: {pipeline}")
            other_config = init_config(
                lang=lang,
                pipeline=pipeline,
                optimize=optimize.value,
                gpu=gpu,
                pretraining=pretraining,
                silent=True,
            )
        else:
            other_config = load_config(compare_to)

    user_text = user_config.to_str()
    other_text = other_config.to_str()
    if user_text == other_text:
        printer.warn("No diff to show: configs are identical")
        return

    # The reference goes first so that the user's values show up as the
    # modified side of the diff.
    diff_text = diff_strings(other_text, user_text, add_symbols=markdown)
    if not markdown:
        print(diff_text)
        return

    renderer = MarkdownRenderer()
    renderer.add(renderer.code_block(diff_text, "diff"))
    print(renderer.text)
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -626,6 +626,235 @@ will not be available.
 | overrides                  | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **PRINTS**                 | Debugging information.                                                                                                                                                                                             |
 ### debug diff-config {#debug-diff tag="command"}
 Show a diff of a config file with respect to spaCy's defaults or another config
 file. If additional settings were used in the creation of the config file, then
 you must supply these as extra parameters to the command when comparing to the
 default settings. The generated diff can also be used when posting to the
 discussion forum to provide more information for the maintainers.
 ```cli
 $ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown]
 ```
 > #### Example
 >
 > ```cli
 > $ python -m spacy debug diff-config ./config.cfg
 > ```
 <Accordion title="Example output" spaced>
 ```
 ℹ Found user-defined language: 'en'
 ℹ Found user-defined pipelines: ['tok2vec', 'tagger', 'parser',
 'ner']
 [paths]
 + train = "./data/train.spacy"
 + dev = "./data/dev.spacy"
 - train = null
 - dev = null
 vectors = null
 init_tok2vec = null
 [system]
 gpu_allocator = null
 + seed = 42
 - seed = 0
 [nlp]
 lang = "en"
 pipeline = ["tok2vec","tagger","parser","ner"]
 batch_size = 1000
 disabled = []
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 [components]
 [components.ner]
 factory = "ner"
 incorrect_spans_key = null
 moves = null
 scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64
 + hidden_width = 36
 maxout_pieces = 2
 use_upper = true
 nO = null
 [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 upstream = "*"
 [components.parser]
 factory = "parser"
 learn_tokens = false
 min_action_freq = 30
 moves = null
 scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 [components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
 nO = null
 [components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 upstream = "*"
 [components.tagger]
 factory = "tagger"
 neg_prefix = "!"
 overwrite = false
 scorer = {"@scorers":"spacy.tagger_scorer.v1"}
 [components.tagger.model]
@architectures = "spacy.Tagger.v1"
 nO = null
 [components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
 upstream = "*"
 [components.tok2vec]
 factory = "tok2vec"
 [components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
 [components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
 rows = [5000,2500,2500,2500]
 include_static_vectors = false
 [components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = 96
 depth = 4
 window_size = 1
 maxout_pieces = 3
 [corpora]
 [corpora.dev]
@readers = "spacy.Corpus.v1"
 path = ${paths.dev}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [corpora.train]
@readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
 [training]
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
 patience = 1600
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
 frozen_components = []
 annotating_components = []
 before_to_disk = null
 [training.batcher]
@batchers = "spacy.batch_by_words.v1"
 discard_oversize = false
 tolerance = 0.2
 get_length = null
 [training.batcher.size]
@schedules = "compounding.v1"
 start = 100
 stop = 1000
 compound = 1.001
 t = 0.0
 [training.logger]
@loggers = "spacy.ConsoleLogger.v1"
 progress_bar = false
 [training.optimizer]
@optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 0.00000001
 learn_rate = 0.001
 [training.score_weights]
 tag_acc = 0.33
 dep_uas = 0.17
 dep_las = 0.17
 dep_las_per_type = null
 sents_p = null
 sents_r = null
 sents_f = 0.0
 ents_f = 0.33
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
 [pretraining]
 [initialize]
 vectors = ${paths.vectors}
 init_tok2vec = ${paths.init_tok2vec}
 vocab_data = null
 lookups = null
 before_init = null
 after_init = null
 [initialize.components]
 [initialize.tokenizer]
 ```
 </Accordion>
 | Name                 | Description                                                                                                                                                                                                                                                                                               |
 | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `config_path`        | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Union[Path, str] \(positional)~~                                                                                                                                                                  |
 | `compare_to`         | Path to another config file to diff against, or `None` to compare against default settings. ~~Optional[Union[Path, str] \(option)~~                                                                                                                                                                       |
 | `optimize`, `-o`     | `"efficiency"` or `"accuracy"`. Whether the config was optimized for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). Only relevant when comparing against a default config. Defaults to `"efficiency"`. ~~str (option)~~ |
 | `gpu`, `-G`          | Whether the config was made to run on a GPU. Only relevant when comparing against a default config. ~~bool (flag)~~                                                                                                                                                                                       |
 | `pretraining`, `--pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Only relevant when comparing against a default config. Defaults to `False`. ~~bool (flag)~~                                                                                                                                  |
 | `markdown`, `-md`    | Generate Markdown for Github issues. Defaults to `False`. ~~bool (flag)~~                                                                                                                                                                                                                                 |
 | **PRINTS**           | Diff between the two config files.                                                                                                                                                                                                                                                                        |
 ### debug profile {#debug-profile tag="command"}
 Profile which functions take the most time in a spaCy pipeline. Input should be