From a0a195688f87ffa6c37d7d4994a37a2acfd7e2ad Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Mon, 31 Jul 2023 08:45:04 -0400 Subject: [PATCH] Tests for CLI app - `init config` generates `train`-able config (#12173) * remove migration support form * initial test commit * add fixture * add combo test * pull out parameter example data * fix formatting on examples * remove unused import * remove unncessary fmt:off instructions * only set logger level if verbose flag is explicitly set --------- Co-authored-by: svlandeg --- spacy/cli/assemble.py | 3 +- spacy/cli/find_threshold.py | 4 +- spacy/cli/init_pipeline.py | 9 +- spacy/cli/train.py | 3 +- spacy/tests/test_cli_app.py | 161 +++++++++++++++++++++++++++++++++++- 5 files changed, 172 insertions(+), 8 deletions(-) diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py index ee2500b27..f74bbacb5 100644 --- a/spacy/cli/assemble.py +++ b/spacy/cli/assemble.py @@ -40,7 +40,8 @@ def assemble_cli( DOCS: https://spacy.io/api/cli#assemble """ - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + if verbose: + util.logger.setLevel(logging.DEBUG) # Make sure all files and paths exists if they are needed if not config_path or (str(config_path) != "-" and not config_path.exists()): msg.fail("Config file not found", config_path, exits=1) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 7aa32c0c6..48077fa51 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -52,8 +52,8 @@ def find_threshold_cli( DOCS: https://spacy.io/api/cli#find-threshold """ - - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + if verbose: + util.logger.setLevel(logging.DEBUG) import_code(code_path) find_threshold( model=model, diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 13202cb60..21eea8edf 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -39,7 +39,8 @@ def init_vectors_cli( you can use in the [initialize] block of your config to initialize a model with vectors. """ - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + if verbose: + util.logger.setLevel(logging.DEBUG) msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() if jsonl_loc is not None: @@ -87,7 +88,8 @@ def init_pipeline_cli( use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") # fmt: on ): - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + if verbose: + util.logger.setLevel(logging.DEBUG) overrides = parse_config_overrides(ctx.args) import_code(code_path) setup_gpu(use_gpu) @@ -116,7 +118,8 @@ def init_labels_cli( """Generate JSON files for the labels in the data. This helps speed up the training process, since spaCy won't have to preprocess the data to extract the labels.""" - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + if verbose: + util.logger.setLevel(logging.DEBUG) if not output_path.exists(): output_path.mkdir(parents=True) overrides = parse_config_overrides(ctx.args) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8bdabd39c..c72e13b26 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -47,7 +47,8 @@ def train_cli( DOCS: https://spacy.io/api/cli#train """ - util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + if verbose: + util.logger.setLevel(logging.DEBUG) overrides = parse_config_overrides(ctx.args) import_code(code_path) train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 0e6d8e252..e6f3b5912 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -6,7 +6,7 @@ import srsly from typer.testing import CliRunner from spacy.cli._util import app, get_git_version -from spacy.tokens import Doc, DocBin +from spacy.tokens import Doc, DocBin, Span from .util import make_tempdir, normalize_whitespace @@ -267,3 +267,162 @@ def test_find_function_invalid(): function = "spacy.TextCatBOW.v666" result = CliRunner().invoke(app, ["find-function", function]) assert f"Couldn't find registered function: '{function}'" in result.stdout + + +example_words_1 = ["I", "like", "cats"] +example_words_2 = ["I", "like", "dogs"] +example_lemmas_1 = ["I", "like", "cat"] +example_lemmas_2 = ["I", "like", "dog"] +example_tags = ["PRP", "VBP", "NNS"] +example_morphs = [ + "Case=Nom|Number=Sing|Person=1|PronType=Prs", + "Tense=Pres|VerbForm=Fin", + "Number=Plur", +] +example_deps = ["nsubj", "ROOT", "dobj"] +example_pos = ["PRON", "VERB", "NOUN"] +example_ents = ["O", "O", "I-ANIMAL"] +example_spans = [(2, 3, "ANIMAL")] + +TRAIN_EXAMPLE_1 = dict( + words=example_words_1, + lemmas=example_lemmas_1, + tags=example_tags, + morphs=example_morphs, + deps=example_deps, + heads=[1, 1, 1], + pos=example_pos, + ents=example_ents, + spans=example_spans, + cats={"CAT": 1.0, "DOG": 0.0}, +) +TRAIN_EXAMPLE_2 = dict( + words=example_words_2, + lemmas=example_lemmas_2, + tags=example_tags, + morphs=example_morphs, + deps=example_deps, + heads=[1, 1, 1], + pos=example_pos, + ents=example_ents, + spans=example_spans, + cats={"CAT": 0.0, "DOG": 1.0}, +) + + +@pytest.mark.slow +@pytest.mark.parametrize( + "component,examples", + [ + ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]), + ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]), + ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]), + ("parser", [TRAIN_EXAMPLE_1] * 30), + ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]), + ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]), + ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]), + ], +) +def test_init_config_trainable(component, examples, en_vocab): + if component == "textcat": + train_docs = [] + for example in examples: + doc = Doc(en_vocab, words=example["words"]) + doc.cats = example["cats"] + train_docs.append(doc) + elif component == "spancat": + train_docs = [] + for example in examples: + doc = Doc(en_vocab, words=example["words"]) + doc.spans["sc"] = [ + Span(doc, start, end, label) for start, end, label in example["spans"] + ] + train_docs.append(doc) + else: + train_docs = [] + for example in examples: + # cats, spans are not valid kwargs for instantiating a Doc + example = {k: v for k, v in example.items() if k not in ("cats", "spans")} + doc = Doc(en_vocab, **example) + train_docs.append(doc) + + with make_tempdir() as d_in: + train_bin = DocBin(docs=train_docs) + train_bin.to_disk(d_in / "train.spacy") + dev_bin = DocBin(docs=train_docs) + dev_bin.to_disk(d_in / "dev.spacy") + init_config_result = CliRunner().invoke( + app, + [ + "init", + "config", + f"{d_in}/config.cfg", + "--lang", + "en", + "--pipeline", + component, + ], + ) + assert init_config_result.exit_code == 0 + train_result = CliRunner().invoke( + app, + [ + "train", + f"{d_in}/config.cfg", + "--paths.train", + f"{d_in}/train.spacy", + "--paths.dev", + f"{d_in}/dev.spacy", + "--output", + f"{d_in}/model", + ], + ) + assert train_result.exit_code == 0 + assert Path(d_in / "model" / "model-last").exists() + + +@pytest.mark.slow +@pytest.mark.parametrize( + "component,examples", + [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)], +) +def test_init_config_trainable_multiple(component, examples, en_vocab): + train_docs = [] + for example in examples: + example = {k: v for k, v in example.items() if k not in ("cats", "spans")} + doc = Doc(en_vocab, **example) + train_docs.append(doc) + + with make_tempdir() as d_in: + train_bin = DocBin(docs=train_docs) + train_bin.to_disk(d_in / "train.spacy") + dev_bin = DocBin(docs=train_docs) + dev_bin.to_disk(d_in / "dev.spacy") + init_config_result = CliRunner().invoke( + app, + [ + "init", + "config", + f"{d_in}/config.cfg", + "--lang", + "en", + "--pipeline", + component, + ], + ) + assert init_config_result.exit_code == 0 + train_result = CliRunner().invoke( + app, + [ + "train", + f"{d_in}/config.cfg", + "--paths.train", + f"{d_in}/train.spacy", + "--paths.dev", + f"{d_in}/dev.spacy", + "--output", + f"{d_in}/model", + ], + ) + assert train_result.exit_code == 0 + assert Path(d_in / "model" / "model-last").exists()