mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-04 21:23:22 +03:00
Add debug diff command in spaCy CLI (#10502)
* Add initial design for diff command For now, the diffing process looks like this: - The default config is created based from some values in the user config (e.g. which pipeline components were used, the lang, etc.) - The user must supply manually if it was optimized for acc/efficiency and if pretraining was involved. * Make diff command structure similar to siblings * Include gpu as a user option for CLI * Make variables more explicit * Fix type declaration for optimize enum * Improve docstrings for diff CLI * Add debug-diff to website API docs * Switch position of configs so that user config is modded * Add markdown flag for debug diff This commit adds a --markdown (--md) flag that allows easier copy-pasting to Github issues. Please note that this commit is dependent on an unreleased version of wasabi (for the time being). For posterity, the related PR is found here: https://github.com/ines/wasabi/pull/20 * Bump version of wasabi to 0.9.1 So that we can use the add_symbols parameter. * Apply suggestions from code review Co-authored-by: Ines Montani <ines@ines.io> * Update docs based on code review suggestions Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Change command name from diff -> diff-config * Clarify when options are relevant or not * Rerun prettier on cli.md Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
b91255a454
commit
02dafa3a84
|
@ -7,7 +7,7 @@ thinc>=8.0.14,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
ml_datasets>=0.2.0,<0.3.0
|
ml_datasets>=0.2.0,<0.3.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.8.1,<1.1.0
|
wasabi>=0.9.1,<1.1.0
|
||||||
srsly>=2.4.1,<3.0.0
|
srsly>=2.4.1,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.5.0
|
typer>=0.3.0,<0.5.0
|
||||||
|
|
|
@ -48,7 +48,7 @@ install_requires =
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.14,<8.1.0
|
thinc>=8.0.14,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
wasabi>=0.8.1,<1.1.0
|
wasabi>=0.9.1,<1.1.0
|
||||||
srsly>=2.4.1,<3.0.0
|
srsly>=2.4.1,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.5.0
|
typer>=0.3.0,<0.5.0
|
||||||
|
|
|
@ -14,6 +14,7 @@ from .pretrain import pretrain # noqa: F401
|
||||||
from .debug_data import debug_data # noqa: F401
|
from .debug_data import debug_data # noqa: F401
|
||||||
from .debug_config import debug_config # noqa: F401
|
from .debug_config import debug_config # noqa: F401
|
||||||
from .debug_model import debug_model # noqa: F401
|
from .debug_model import debug_model # noqa: F401
|
||||||
|
from .debug_diff import debug_diff # noqa: F401
|
||||||
from .evaluate import evaluate # noqa: F401
|
from .evaluate import evaluate # noqa: F401
|
||||||
from .convert import convert # noqa: F401
|
from .convert import convert # noqa: F401
|
||||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
|
|
89
spacy/cli/debug_diff.py
Normal file
89
spacy/cli/debug_diff.py
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import Printer, diff_strings, MarkdownRenderer
|
||||||
|
from pathlib import Path
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
|
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
|
||||||
|
from ..util import load_config
|
||||||
|
from .init_config import init_config, Optimizations
|
||||||
|
|
||||||
|
|
||||||
|
@debug_cli.command(
    "diff-config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
    # fmt: off
    ctx: typer.Context,
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
    # "-pt" is the shortcut the API docs advertise; "--pt" is kept so existing
    # invocations continue to work.
    pretraining: bool = Opt(False, "--pretraining", "--pt", "-pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
    # fmt: on
):
    """Show a diff of a config file with respect to spaCy's defaults or another
    config file. If additional settings were used in the creation of the config
    file, then you must supply these as extra parameters to the command when
    comparing to the default settings. The generated diff can also be used when
    posting to the discussion forum to provide more information for the
    maintainers.

    The `optimize`, `gpu`, and `pretraining` options are only relevant when
    comparing against the default configuration (or specifically when
    `compare_to` is None).

    DOCS: https://spacy.io/api/cli#debug-diff
    """
    # NOTE(review): the command accepts extra/unknown arguments via
    # `context_settings`, but `ctx` is never read here, so any such arguments
    # have no effect on the diff. Confirm whether config overrides were meant
    # to be applied before diffing.
    debug_diff(
        config_path=config_path,
        compare_to=compare_to,
        gpu=gpu,
        optimize=optimize,
        pretraining=pretraining,
        markdown=markdown,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
):
    """Print a diff between a user config and a reference config.

    The reference is either the config at `compare_to` or, when that is not
    given, a config generated by `init_config` from the language and pipeline
    found in the user config. The user config is passed as the second argument
    to `diff_strings`, i.e. it is treated as the modified side of the diff.

    config_path (Path): Path to the user's config file.
    compare_to (Optional[Path]): Path to the config to diff against. If not
        set, a generated default config is used instead.
    gpu (bool): Forwarded to `init_config`; only used when `compare_to` is
        not set.
    optimize (Optimizations): Its `.value` is forwarded to `init_config`; only
        used when `compare_to` is not set.
    pretraining (bool): Forwarded to `init_config`; only used when
        `compare_to` is not set.
    markdown (bool): If True, the diff is produced with `add_symbols=True`
        and printed inside a Markdown "diff" code block; otherwise the raw
        diff text is printed.
    """
    msg = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if compare_to:
            # Loaded under the same handler as the user config so that a
            # problem in either file is reported in the same way.
            other_config = load_config(compare_to)
    if not compare_to:
        # Recreate a default config based on the user's config
        lang = user_config["nlp"]["lang"]
        pipeline = list(user_config["nlp"]["pipeline"])
        msg.info(f"Found user-defined language: '{lang}'")
        msg.info(f"Found user-defined pipelines: {pipeline}")
        other_config = init_config(
            lang=lang,
            pipeline=pipeline,
            optimize=optimize.value,
            gpu=gpu,
            pretraining=pretraining,
            silent=True,
        )

    user = user_config.to_str()
    other = other_config.to_str()

    if user == other:
        msg.warn("No diff to show: configs are identical")
    else:
        diff_text = diff_strings(other, user, add_symbols=markdown)
        if markdown:
            md = MarkdownRenderer()
            md.add(md.code_block(diff_text, "diff"))
            print(md.text)
        else:
            print(diff_text)
|
|
@ -626,6 +626,235 @@ will not be available.
|
||||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||||
| **PRINTS** | Debugging information. |
|
| **PRINTS** | Debugging information. |
|
||||||
|
|
||||||
|
### debug diff-config {#debug-diff tag="command"}
|
||||||
|
|
||||||
|
Show a diff of a config file with respect to spaCy's defaults or another config
|
||||||
|
file. If additional settings were used in the creation of the config file, then
|
||||||
|
you must supply these as extra parameters to the command when comparing to the
|
||||||
|
default settings. The generated diff can also be used when posting to the
|
||||||
|
discussion forum to provide more information for the maintainers.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown]
|
||||||
|
```
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```cli
|
||||||
|
> $ python -m spacy debug diff-config ./config.cfg
|
||||||
|
> ```
|
||||||
|
|
||||||
|
<Accordion title="Example output" spaced>
|
||||||
|
|
||||||
|
```
|
||||||
|
ℹ Found user-defined language: 'en'
|
||||||
|
ℹ Found user-defined pipelines: ['tok2vec', 'tagger', 'parser',
|
||||||
|
'ner']
|
||||||
|
[paths]
|
||||||
|
+ train = "./data/train.spacy"
|
||||||
|
+ dev = "./data/dev.spacy"
|
||||||
|
- train = null
|
||||||
|
- dev = null
|
||||||
|
vectors = null
|
||||||
|
init_tok2vec = null
|
||||||
|
|
||||||
|
[system]
|
||||||
|
gpu_allocator = null
|
||||||
|
+ seed = 42
|
||||||
|
- seed = 0
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec","tagger","parser","ner"]
|
||||||
|
batch_size = 1000
|
||||||
|
disabled = []
|
||||||
|
before_creation = null
|
||||||
|
after_creation = null
|
||||||
|
after_pipeline_creation = null
|
||||||
|
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.ner]
|
||||||
|
factory = "ner"
|
||||||
|
incorrect_spans_key = null
|
||||||
|
moves = null
|
||||||
|
scorer = {"@scorers":"spacy.ner_scorer.v1"}
|
||||||
|
update_with_oracle_cut_size = 100
|
||||||
|
|
||||||
|
[components.ner.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
|
state_type = "ner"
|
||||||
|
extra_state_tokens = false
|
||||||
|
- hidden_width = 64
|
||||||
|
+ hidden_width = 36
|
||||||
|
maxout_pieces = 2
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.ner.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
upstream = "*"
|
||||||
|
|
||||||
|
[components.parser]
|
||||||
|
factory = "parser"
|
||||||
|
learn_tokens = false
|
||||||
|
min_action_freq = 30
|
||||||
|
moves = null
|
||||||
|
scorer = {"@scorers":"spacy.parser_scorer.v1"}
|
||||||
|
update_with_oracle_cut_size = 100
|
||||||
|
|
||||||
|
[components.parser.model]
|
||||||
|
@architectures = "spacy.TransitionBasedParser.v2"
|
||||||
|
state_type = "parser"
|
||||||
|
extra_state_tokens = false
|
||||||
|
hidden_width = 128
|
||||||
|
maxout_pieces = 3
|
||||||
|
use_upper = true
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.parser.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
upstream = "*"
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
neg_prefix = "!"
|
||||||
|
overwrite = false
|
||||||
|
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
nO = null
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
upstream = "*"
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
|
||||||
|
[components.tok2vec.model.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v2"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
|
||||||
|
rows = [5000,2500,2500,2500]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[components.tok2vec.model.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||||
|
width = 96
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
max_length = 0
|
||||||
|
gold_preproc = false
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
max_length = 0
|
||||||
|
gold_preproc = false
|
||||||
|
limit = 0
|
||||||
|
augmenter = null
|
||||||
|
|
||||||
|
[training]
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
seed = ${system.seed}
|
||||||
|
gpu_allocator = ${system.gpu_allocator}
|
||||||
|
dropout = 0.1
|
||||||
|
accumulate_gradient = 1
|
||||||
|
patience = 1600
|
||||||
|
max_epochs = 0
|
||||||
|
max_steps = 20000
|
||||||
|
eval_frequency = 200
|
||||||
|
frozen_components = []
|
||||||
|
annotating_components = []
|
||||||
|
before_to_disk = null
|
||||||
|
|
||||||
|
[training.batcher]
|
||||||
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
discard_oversize = false
|
||||||
|
tolerance = 0.2
|
||||||
|
get_length = null
|
||||||
|
|
||||||
|
[training.batcher.size]
|
||||||
|
@schedules = "compounding.v1"
|
||||||
|
start = 100
|
||||||
|
stop = 1000
|
||||||
|
compound = 1.001
|
||||||
|
t = 0.0
|
||||||
|
|
||||||
|
[training.logger]
|
||||||
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
progress_bar = false
|
||||||
|
|
||||||
|
[training.optimizer]
|
||||||
|
@optimizers = "Adam.v1"
|
||||||
|
beta1 = 0.9
|
||||||
|
beta2 = 0.999
|
||||||
|
L2_is_weight_decay = true
|
||||||
|
L2 = 0.01
|
||||||
|
grad_clip = 1.0
|
||||||
|
use_averages = false
|
||||||
|
eps = 0.00000001
|
||||||
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[training.score_weights]
|
||||||
|
tag_acc = 0.33
|
||||||
|
dep_uas = 0.17
|
||||||
|
dep_las = 0.17
|
||||||
|
dep_las_per_type = null
|
||||||
|
sents_p = null
|
||||||
|
sents_r = null
|
||||||
|
sents_f = 0.0
|
||||||
|
ents_f = 0.33
|
||||||
|
ents_p = 0.0
|
||||||
|
ents_r = 0.0
|
||||||
|
ents_per_type = null
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
|
||||||
|
[initialize]
|
||||||
|
vectors = ${paths.vectors}
|
||||||
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
vocab_data = null
|
||||||
|
lookups = null
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
||||||
|
[initialize.components]
|
||||||
|
|
||||||
|
[initialize.tokenizer]
|
||||||
|
```
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Union[Path, str] \(positional)~~ |
|
||||||
|
| `compare_to` | Path to another config file to diff against, or `None` to compare against default settings. ~~Optional[Union[Path, str]] \(option)~~ |
|
||||||
|
| `optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether the config was optimized for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). Only relevant when comparing against a default config. Defaults to `"efficiency"`. ~~str (option)~~ |
|
||||||
|
| `gpu`, `-G` | Whether the config was made to run on a GPU. Only relevant when comparing against a default config. ~~bool (flag)~~ |
|
||||||
|
| `pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Only relevant when comparing against a default config. Defaults to `False`. ~~bool (flag)~~ |
|
||||||
|
| `markdown`, `-md` | Generate Markdown for GitHub issues. Defaults to `False`. ~~bool (flag)~~ |
|
||||||
|
| **PRINTS** | Diff between the two config files. |
|
||||||
|
|
||||||
### debug profile {#debug-profile tag="command"}
|
### debug profile {#debug-profile tag="command"}
|
||||||
|
|
||||||
Profile which functions take the most time in a spaCy pipeline. Input should be
|
Profile which functions take the most time in a spaCy pipeline. Input should be
|
||||||
|
|
Loading…
Reference in New Issue
Block a user