mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Add debug diff command in spaCy CLI (#10502)
* Add initial design for diff command For now, the diffing process looks like this: - The default config is created based from some values in the user config (e.g. which pipeline components were used, the lang, etc.) - The user must supply manually if it was optimized for acc/efficiency and if pretraining was involved. * Make diff command structure similar to siblings * Include gpu as a user option for CLI * Make variables more explicit * Fix type declaration for optimize enum * Improve docstrings for diff CLI * Add debug-diff to website API docs * Switch position of configs so that user config is modded * Add markdown flag for debug diff This commit adds a --markdown (--md) flag that allows easier copy-pasting to Github issues. Please note that this commit is dependent on an unreleased version of wasabi (for the time being). For posterity, the related PR is found here: https://github.com/ines/wasabi/pull/20 * Bump version of wasabi to 0.9.1 So that we can use the add_symbols parameter. * Apply suggestions from code review Co-authored-by: Ines Montani <ines@ines.io> * Update docs based on code review suggestions Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Change command name from diff -> diff-config * Clarify when options are relevant or not * Rerun prettier on cli.md Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
b91255a454
commit
02dafa3a84
|
@ -7,7 +7,7 @@ thinc>=8.0.14,<8.1.0
|
|||
blis>=0.4.0,<0.8.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.8.1,<1.1.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
srsly>=2.4.1,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.5.0
|
||||
|
|
|
@ -48,7 +48,7 @@ install_requires =
|
|||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.14,<8.1.0
|
||||
blis>=0.4.0,<0.8.0
|
||||
wasabi>=0.8.1,<1.1.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
srsly>=2.4.1,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.5.0
|
||||
|
|
|
@ -14,6 +14,7 @@ from .pretrain import pretrain # noqa: F401
|
|||
from .debug_data import debug_data # noqa: F401
|
||||
from .debug_config import debug_config # noqa: F401
|
||||
from .debug_model import debug_model # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
|
|
89
spacy/cli/debug_diff.py
Normal file
89
spacy/cli/debug_diff.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from wasabi import Printer, diff_strings, MarkdownRenderer
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
|
||||
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ..util import load_config
|
||||
from .init_config import init_config, Optimizations
|
||||
|
||||
|
||||
@debug_cli.command(
    "diff-config",
    # NOTE(review): these settings presumably make the command accept extra or
    # unknown arguments instead of rejecting them -- confirm against the
    # Typer/Click context documentation.
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
    # fmt: off
    ctx: typer.Context,
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
    # fmt: on
):
    """Show a diff of a config file with respect to spaCy's defaults or another config file. If
    additional settings were used in the creation of the config file, then you
    must supply these as extra parameters to the command when comparing to the default settings. The generated diff
    can also be used when posting to the discussion forum to provide more
    information for the maintainers.

    The `optimize`, `gpu`, and `pretraining` options are only relevant when
    comparing against the default configuration (or specifically when `compare_to` is None).

    DOCS: https://spacy.io/api/cli#debug-diff
    """
    # NOTE(review): `ctx` is never read in this function, so any extra
    # arguments carried by the context are not forwarded to `debug_diff`.
    # Every other parameter is passed through unchanged by keyword.
    debug_diff(
        config_path=config_path,
        compare_to=compare_to,
        gpu=gpu,
        optimize=optimize,
        pretraining=pretraining,
        markdown=markdown,
    )
|
||||
|
||||
|
||||
def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
):
    """Print a diff between the user's config and a reference config.

    config_path (Path): Config file to load as the user's config.
    compare_to (Optional[Path]): Config file to diff against. If not given,
        a reference config is generated with `init_config` from the language
        and pipeline found in the user's config.
    gpu (bool): Forwarded to `init_config` when a reference is generated.
    optimize (Optimizations): Its `.value` is forwarded to `init_config` when
        a reference is generated.
    pretraining (bool): Forwarded to `init_config` when a reference is
        generated.
    markdown (bool): If set, the diff is requested with symbols and printed
        inside a Markdown "diff" code block; otherwise it is printed as-is.
    """
    printer = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if not compare_to:
            # No reference file supplied: rebuild a default config that uses
            # the same language and pipeline as the user's config.
            nlp_section = user_config["nlp"]
            lang = nlp_section["lang"]
            pipeline = list(nlp_section["pipeline"])
            printer.info(f"Found user-defined language: '{lang}'")
            printer.info(f"Found user-defined pipelines: {pipeline}")
            other_config = init_config(
                lang=lang,
                pipeline=pipeline,
                optimize=optimize.value,
                gpu=gpu,
                pretraining=pretraining,
                silent=True,
            )
        else:
            other_config = load_config(compare_to)

    user_text = user_config.to_str()
    other_text = other_config.to_str()

    if user_text == other_text:
        printer.warn("No diff to show: configs are identical")
        return

    # The reference config is passed first, the user's config second.
    diff_text = diff_strings(other_text, user_text, add_symbols=markdown)
    if not markdown:
        print(diff_text)
        return

    renderer = MarkdownRenderer()
    renderer.add(renderer.code_block(diff_text, "diff"))
    print(renderer.text)
|
|
@ -626,6 +626,235 @@ will not be available.
|
|||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||
| **PRINTS** | Debugging information. |
|
||||
|
||||
### debug diff-config {#debug-diff tag="command"}
|
||||
|
||||
Show a diff of a config file with respect to spaCy's defaults or another config
|
||||
file. If additional settings were used in the creation of the config file, then
|
||||
you must supply these as extra parameters to the command when comparing to the
|
||||
default settings. The generated diff can also be used when posting to the
|
||||
discussion forum to provide more information for the maintainers.
|
||||
|
||||
```cli
|
||||
$ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```cli
|
||||
> $ python -m spacy debug diff-config ./config.cfg
|
||||
> ```
|
||||
|
||||
<Accordion title="Example output" spaced>
|
||||
|
||||
```
|
||||
ℹ Found user-defined language: 'en'
|
||||
ℹ Found user-defined pipelines: ['tok2vec', 'tagger', 'parser',
|
||||
'ner']
|
||||
[paths]
|
||||
+ train = "./data/train.spacy"
|
||||
+ dev = "./data/dev.spacy"
|
||||
- train = null
|
||||
- dev = null
|
||||
vectors = null
|
||||
init_tok2vec = null
|
||||
|
||||
[system]
|
||||
gpu_allocator = null
|
||||
+ seed = 42
|
||||
- seed = 0
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec","tagger","parser","ner"]
|
||||
batch_size = 1000
|
||||
disabled = []
|
||||
before_creation = null
|
||||
after_creation = null
|
||||
after_pipeline_creation = null
|
||||
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
||||
|
||||
[components]
|
||||
|
||||
[components.ner]
|
||||
factory = "ner"
|
||||
incorrect_spans_key = null
|
||||
moves = null
|
||||
scorer = {"@scorers":"spacy.ner_scorer.v1"}
|
||||
update_with_oracle_cut_size = 100
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
- hidden_width = 64
|
||||
+ hidden_width = 36
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
upstream = "*"
|
||||
|
||||
[components.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 30
|
||||
moves = null
|
||||
scorer = {"@scorers":"spacy.parser_scorer.v1"}
|
||||
update_with_oracle_cut_size = 100
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
upstream = "*"
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
neg_prefix = "!"
|
||||
overwrite = false
|
||||
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
upstream = "*"
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
|
||||
rows = [5000,2500,2500,2500]
|
||||
include_static_vectors = false
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
|
||||
[corpora]
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.dev}
|
||||
max_length = 0
|
||||
gold_preproc = false
|
||||
limit = 0
|
||||
augmenter = null
|
||||
|
||||
[corpora.train]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
max_length = 0
|
||||
gold_preproc = false
|
||||
limit = 0
|
||||
augmenter = null
|
||||
|
||||
[training]
|
||||
dev_corpus = "corpora.dev"
|
||||
train_corpus = "corpora.train"
|
||||
seed = ${system.seed}
|
||||
gpu_allocator = ${system.gpu_allocator}
|
||||
dropout = 0.1
|
||||
accumulate_gradient = 1
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 200
|
||||
frozen_components = []
|
||||
annotating_components = []
|
||||
before_to_disk = null
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
discard_oversize = false
|
||||
tolerance = 0.2
|
||||
get_length = null
|
||||
|
||||
[training.batcher.size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
t = 0.0
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
progress_bar = false
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = false
|
||||
eps = 0.00000001
|
||||
learn_rate = 0.001
|
||||
|
||||
[training.score_weights]
|
||||
tag_acc = 0.33
|
||||
dep_uas = 0.17
|
||||
dep_las = 0.17
|
||||
dep_las_per_type = null
|
||||
sents_p = null
|
||||
sents_r = null
|
||||
sents_f = 0.0
|
||||
ents_f = 0.33
|
||||
ents_p = 0.0
|
||||
ents_r = 0.0
|
||||
ents_per_type = null
|
||||
|
||||
[pretraining]
|
||||
|
||||
[initialize]
|
||||
vectors = ${paths.vectors}
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
vocab_data = null
|
||||
lookups = null
|
||||
before_init = null
|
||||
after_init = null
|
||||
|
||||
[initialize.components]
|
||||
|
||||
[initialize.tokenizer]
|
||||
```
|
||||
|
||||
</Accordion>
|
||||
|
||||
| Name | Description |
|
||||
| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Union[Path, str] \(positional)~~ |
|
||||
| `compare_to` | Path to another config file to diff against, or `None` to compare against default settings. ~~Optional[Union[Path, str] \(option)~~ |
|
||||
| `optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether the config was optimized for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). Only relevant when comparing against a default config. Defaults to `"efficiency"`. ~~str (option)~~ |
|
||||
| `gpu`, `-G` | Whether the config was made to run on a GPU. Only relevant when comparing against a default config. ~~bool (flag)~~ |
|
||||
| `pretraining`, `--pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Only relevant when comparing against a default config. Defaults to `False`. ~~bool (flag)~~                                                                                                                                 |
|
||||
| `markdown`, `-md` | Generate Markdown for Github issues. Defaults to `False`. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Diff between the two config files. |
|
||||
|
||||
### debug profile {#debug-profile tag="command"}
|
||||
|
||||
Profile which functions take the most time in a spaCy pipeline. Input should be
|
||||
|
|
Loading…
Reference in New Issue
Block a user