diff --git a/requirements.txt b/requirements.txt index 7b9d343a9..71b6f3279 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ thinc>=8.0.14,<8.1.0 blis>=0.4.0,<0.8.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.8.1,<1.1.0 +wasabi>=0.9.1,<1.1.0 srsly>=2.4.1,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.5.0 diff --git a/setup.cfg b/setup.cfg index 3c5ba884a..65f7dc528 100644 --- a/setup.cfg +++ b/setup.cfg @@ -48,7 +48,7 @@ install_requires = preshed>=3.0.2,<3.1.0 thinc>=8.0.14,<8.1.0 blis>=0.4.0,<0.8.0 - wasabi>=0.8.1,<1.1.0 + wasabi>=0.9.1,<1.1.0 srsly>=2.4.1,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.5.0 diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index fd8da262e..ce76ef9a9 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -14,6 +14,7 @@ from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .debug_config import debug_config # noqa: F401 from .debug_model import debug_model # noqa: F401 +from .debug_diff import debug_diff # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_pipeline import init_pipeline_cli # noqa: F401 diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py new file mode 100644 index 000000000..6697c38ae --- /dev/null +++ b/spacy/cli/debug_diff.py @@ -0,0 +1,89 @@ +from typing import Optional + +import typer +from wasabi import Printer, diff_strings, MarkdownRenderer +from pathlib import Path +from thinc.api import Config + +from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides +from ..util import load_config +from .init_config import init_config, Optimizations + + +@debug_cli.command( + "diff-config", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def debug_diff_cli( + # fmt: off + ctx: typer.Context, + config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), + compare_to: 
Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True), + optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."), + gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."), + pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."), + markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues") + # fmt: on +): + """Show a diff of a config file with respect to spaCy's defaults or another config file. If + additional settings were used in the creation of the config file, then you + must supply these as extra parameters to the command when comparing to the default settings. The generated diff + can also be used when posting to the discussion forum to provide more + information for the maintainers. + + The `optimize`, `gpu`, and `pretraining` options are only relevant when + comparing against the default configuration (or specifically when `compare_to` is None). 
+ + DOCS: https://spacy.io/api/cli#debug-diff + """ + debug_diff( + config_path=config_path, + compare_to=compare_to, + gpu=gpu, + optimize=optimize, + pretraining=pretraining, + markdown=markdown, + ) + + +def debug_diff( + config_path: Path, + compare_to: Optional[Path], + gpu: bool, + optimize: Optimizations, + pretraining: bool, + markdown: bool, +): + msg = Printer() + with show_validation_error(hint_fill=False): + user_config = load_config(config_path) + if compare_to: + other_config = load_config(compare_to) + else: + # No comparison file given: regenerate a default config from the language and pipeline found in the user's config + lang = user_config["nlp"]["lang"] + pipeline = list(user_config["nlp"]["pipeline"]) + msg.info(f"Found user-defined language: '{lang}'") + msg.info(f"Found user-defined pipelines: {pipeline}") + other_config = init_config( + lang=lang, + pipeline=pipeline, + optimize=optimize.value, + gpu=gpu, + pretraining=pretraining, + silent=True, + ) + + user = user_config.to_str() + other = other_config.to_str() + + if user == other: + msg.warn("No diff to show: configs are identical") + else: + diff_text = diff_strings(other, user, add_symbols=markdown) + if markdown: + md = MarkdownRenderer() + md.add(md.code_block(diff_text, "diff")) + print(md.text) + else: + print(diff_text) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 89e2e87d9..e801ff0a6 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -626,6 +626,235 @@ will not be available. | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **PRINTS** | Debugging information. | +### debug diff-config {#debug-diff tag="command"} + +Show a diff of a config file with respect to spaCy's defaults or another config +file. 
If additional settings were used in the creation of the config file, then +you must supply these as extra parameters to the command when comparing to the +default settings. The generated diff can also be used when posting to the +discussion forum to provide more information for the maintainers. + +```cli +$ python -m spacy debug diff-config [config_path] [--compare-to] [--optimize] [--gpu] [--pretraining] [--markdown] +``` + +> #### Example +> +> ```cli +> $ python -m spacy debug diff-config ./config.cfg +> ``` + + + +``` +ℹ Found user-defined language: 'en' +ℹ Found user-defined pipelines: ['tok2vec', 'tagger', 'parser', +'ner'] +[paths] ++ train = "./data/train.spacy" ++ dev = "./data/dev.spacy" +- train = null +- dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = null ++ seed = 42 +- seed = 0 + +[nlp] +lang = "en" +pipeline = ["tok2vec","tagger","parser","ner"] +batch_size = 1000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +incorrect_spans_key = null +moves = null +scorer = {"@scorers":"spacy.ner_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +- hidden_width = 64 ++ hidden_width = 36 +maxout_pieces = 2 +use_upper = true +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +scorer = {"@scorers":"spacy.parser_scorer.v1"} +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = true +nO = null + 
+[components.parser.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tagger] +factory = "tagger" +neg_prefix = "!" +overwrite = false +scorer = {"@scorers":"spacy.tagger_scorer.v1"} + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v2" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 
+L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +dep_las_per_type = null +sents_p = null +sents_r = null +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 +ents_per_type = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] +``` + + + +| Name | Description | +| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Union[Path, str] \(positional)~~ | +| `compare_to` | Path to another config file to diff against, or `None` to compare against default settings. ~~Optional[Union[Path, str]] \(option)~~ | +| `optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether the config was optimized for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). Only relevant when comparing against a default config. Defaults to `"efficiency"`. ~~str (option)~~ | +| `gpu`, `-G` | Whether the config was made to run on a GPU. Only relevant when comparing against a default config. ~~bool (flag)~~ | +| `pretraining`, `--pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Only relevant when comparing against a default config. Defaults to `False`. ~~bool (flag)~~ | +| `markdown`, `-md` | Generate Markdown for GitHub issues. Defaults to `False`. 
~~bool (flag)~~ | +| **PRINTS** | Diff between the two config files. | + ### debug profile {#debug-profile tag="command"} Profile which functions take the most time in a spaCy pipeline. Input should be