Add debug diff command in spaCy CLI (#10502)

* Add initial design for diff command

For now, the diffing process looks like this:
- The default config is created based on some values in the user
config (e.g. which pipeline components were used, the lang, etc.)
- The user must manually specify whether the config was optimized for
accuracy or efficiency and whether pretraining was involved; see the
invocation sketched below.
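
A sketch of the eventual invocation (using the final `diff-config`
command name; paths are hypothetical):

```cli
$ python -m spacy debug diff-config ./config.cfg --optimize accuracy --pretraining
```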

* Make diff command structure similar to siblings

* Include gpu as a user option for CLI

* Make variables more explicit

* Fix type declaration for optimize enum

* Improve docstrings for diff CLI

* Add debug-diff to website API docs

* Switch position of configs so that user config is modded

* Add markdown flag for debug diff

This commit adds a `--markdown` (`-md`) flag that makes the output easier
to copy-paste into GitHub issues. Please note that this commit depends
on an unreleased version of wasabi (for the time being).

For posterity, the related PR is found here: https://github.com/ines/wasabi/pull/20

* Bump version of wasabi to 0.9.1

So that we can use the add_symbols parameter.
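
A minimal sketch of what the new parameter enables (the strings here are
illustrative, not from the codebase):

```python
from wasabi import diff_strings

old = "train = null"
new = 'train = "./data/train.spacy"'

# add_symbols=True (new in wasabi 0.9.1) prefixes changed lines with
# "+"/"-", so the output renders as a proper diff inside a ```diff
# code block on GitHub.
print(diff_strings(old, new, add_symbols=True))
```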

* Apply suggestions from code review

Co-authored-by: Ines Montani <ines@ines.io>

* Update docs based on code review suggestions

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Change command name from diff -> diff-config

* Clarify when options are relevant or not

* Rerun prettier on cli.md

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Lj Miranda 2022-04-07 16:48:45 +08:00 committed by GitHub
parent b91255a454
commit 02dafa3a84
5 changed files with 321 additions and 2 deletions

requirements.txt
@@ -7,7 +7,7 @@ thinc>=8.0.14,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
-wasabi>=0.8.1,<1.1.0
+wasabi>=0.9.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0

setup.cfg
@@ -48,7 +48,7 @@ install_requires =
preshed>=3.0.2,<3.1.0
thinc>=8.0.14,<8.1.0
blis>=0.4.0,<0.8.0
-wasabi>=0.8.1,<1.1.0
+wasabi>=0.9.1,<1.1.0
srsly>=2.4.1,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0

spacy/cli/__init__.py
@@ -14,6 +14,7 @@ from .pretrain import pretrain  # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401

spacy/cli/debug_diff.py (new file, 89 lines)
@@ -0,0 +1,89 @@
from typing import Optional
import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config

from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from .init_config import init_config, Optimizations


@debug_cli.command(
    "diff-config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
    # fmt: off
    ctx: typer.Context,
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
    # fmt: on
):
    """Show a diff of a config file with respect to spaCy's defaults or
    another config file. If additional settings were used in the creation
    of the config file, then you must supply these as extra parameters to
    the command when comparing to the default settings. The generated diff
    can also be used when posting to the discussion forum to provide more
    information for the maintainers.

    The `optimize`, `gpu`, and `pretraining` options are only relevant when
    comparing against the default configuration (or specifically when
    `compare_to` is None).

    DOCS: https://spacy.io/api/cli#debug-diff
    """
    debug_diff(
        config_path=config_path,
        compare_to=compare_to,
        gpu=gpu,
        optimize=optimize,
        pretraining=pretraining,
        markdown=markdown,
    )


def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
):
    msg = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if compare_to:
            # Diff against another user-supplied config
            other_config = load_config(compare_to)
        else:
            # Recreate a default config based on the user's config, reusing
            # the lang/pipeline and applying the user-supplied settings
            lang = user_config["nlp"]["lang"]
            pipeline = list(user_config["nlp"]["pipeline"])
            msg.info(f"Found user-defined language: '{lang}'")
            msg.info(f"Found user-defined pipelines: {pipeline}")
            other_config = init_config(
                lang=lang,
                pipeline=pipeline,
                optimize=optimize.value,
                gpu=gpu,
                pretraining=pretraining,
                silent=True,
            )

    user = user_config.to_str()
    other = other_config.to_str()

    if user == other:
        msg.warn("No diff to show: configs are identical")
    else:
        # Diff in "other -> user" order so the user's values show as additions
        diff_text = diff_strings(other, user, add_symbols=markdown)
        if markdown:
            # Wrap the diff in a fenced ```diff block for GitHub issues
            md = MarkdownRenderer()
            md.add(md.code_block(diff_text, "diff"))
            print(md.text)
        else:
            print(diff_text)
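
For reference, a minimal sketch of calling the new command programmatically
rather than via `spacy debug diff-config` (paths are hypothetical):

```python
from pathlib import Path

from spacy.cli.debug_diff import debug_diff
from spacy.cli.init_config import Optimizations

# Compare a local config against a freshly generated default config
# (compare_to=None), mirroring what the CLI entry point does.
debug_diff(
    config_path=Path("config.cfg"),
    compare_to=None,
    gpu=False,
    optimize=Optimizations.efficiency,
    pretraining=False,
    markdown=False,
)
```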

website/docs/api/cli.md
@@ -626,6 +626,235 @@ will not be available.
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |

### debug diff-config {#debug-diff tag="command"}

Show a diff of a config file with respect to spaCy's defaults or another config
file. If additional settings were used in the creation of the config file, then
you must supply these as extra parameters to the command when comparing to the
default settings. The generated diff can also be used when posting to the
discussion forum to provide more information for the maintainers.

```cli
$ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown]
```

> #### Example
>
> ```cli
> $ python -m spacy debug diff-config ./config.cfg
> ```
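>
> A hypothetical variation that diffs against a second config and emits
> GitHub-ready Markdown:
>
> ```cli
> $ python -m spacy debug diff-config ./config.cfg --compare-to ./other.cfg --markdown
> ```
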
<Accordion title="Example output" spaced>

```
Found user-defined language: 'en'
Found user-defined pipelines: ['tok2vec', 'tagger', 'parser', 'ner']

[paths]
+ train = "./data/train.spacy"
+ dev = "./data/dev.spacy"
- train = null
- dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
+ seed = 42
- seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","tagger","parser","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
- hidden_width = 64
+ hidden_width = 36
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tagger]
factory = "tagger"
neg_prefix = "!"
overwrite = false
scorer = {"@scorers":"spacy.tagger_scorer.v1"}

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
tag_acc = 0.33
dep_uas = 0.17
dep_las = 0.17
dep_las_per_type = null
sents_p = null
sents_r = null
sents_f = 0.0
ents_f = 0.33
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
```

</Accordion>

| Name | Description |
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Union[Path, str] \(positional)~~ |
| `compare_to` | Path to another config file to diff against, or `None` to compare against default settings. ~~Optional[Union[Path, str]] \(option)~~ |
| `optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether the config was optimized for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). Only relevant when comparing against a default config. Defaults to `"efficiency"`. ~~str (option)~~ |
| `gpu`, `-G` | Whether the config was made to run on a GPU. Only relevant when comparing against a default config. ~~bool (flag)~~ |
| `pretraining`, `--pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Only relevant when comparing against a default config. Defaults to `False`. ~~bool (flag)~~ |
| `markdown`, `-md` | Generate Markdown for GitHub issues. Defaults to `False`. ~~bool (flag)~~ |
| **PRINTS** | Diff between the two config files. |
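
With `--markdown`, the diff is wrapped in a fenced `diff` code block so the
`+`/`-` symbols render as a colored diff on GitHub; abridged from the example
output above:

```diff
[paths]
+ train = "./data/train.spacy"
+ dev = "./data/dev.spacy"
- train = null
- dev = null
```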

### debug profile {#debug-profile tag="command"}

Profile which functions take the most time in a spaCy pipeline. Input should be