Tests for CLI app - init config generates train-able config (#12173)

* remove migration support form * initial test commit * add fixture * add combo test * pull out parameter example data * fix formatting on examples * remove unused import * remove unncessary fmt:off instructions * only set logger level if verbose flag is explicitly set --------- Co-authored-by: svlandeg <svlandeg@github.com>
2025-10-31 07:57:35 +03:00 · 2023-07-31 08:45:04 -04:00 · 2023-07-31 08:45:04 -04:00 · a0a195688f
commit a0a195688f
parent 186889ec9c
5 changed files with 172 additions and 8 deletions
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@ -40,7 +40,8 @@ def assemble_cli(
    DOCS: https://spacy.io/api/cli#assemble
    """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
        util.logger.setLevel(logging.DEBUG)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@ -52,8 +52,8 @@ def find_threshold_cli(
    DOCS: https://spacy.io/api/cli#find-threshold
    """
-
+    if verbose:
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+        util.logger.setLevel(logging.DEBUG)
    import_code(code_path)
    find_threshold(
        model=model,
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@ -39,7 +39,8 @@ def init_vectors_cli(
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
        util.logger.setLevel(logging.DEBUG)
    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
@ -87,7 +88,8 @@ def init_pipeline_cli(
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
 ):
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
        util.logger.setLevel(logging.DEBUG)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    setup_gpu(use_gpu)
@ -116,7 +118,8 @@ def init_labels_cli(
    """Generate JSON files for the labels in the data. This helps speed up the
    training process, since spaCy won't have to preprocess the data to
    extract the labels."""
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
        util.logger.setLevel(logging.DEBUG)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    overrides = parse_config_overrides(ctx.args)
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -47,7 +47,8 @@ def train_cli(
    DOCS: https://spacy.io/api/cli#train
    """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
        util.logger.setLevel(logging.DEBUG)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@ -6,7 +6,7 @@ import srsly
 from typer.testing import CliRunner
 from spacy.cli._util import app, get_git_version
-from spacy.tokens import Doc, DocBin
+from spacy.tokens import Doc, DocBin, Span
 from .util import make_tempdir, normalize_whitespace
@ -267,3 +267,162 @@ def test_find_function_invalid():
    function = "spacy.TextCatBOW.v666"
    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Couldn't find registered function: '{function}'" in result.stdout
 example_words_1 = ["I", "like", "cats"]
 example_words_2 = ["I", "like", "dogs"]
 example_lemmas_1 = ["I", "like", "cat"]
 example_lemmas_2 = ["I", "like", "dog"]
 example_tags = ["PRP", "VBP", "NNS"]
 example_morphs = [
    "Case=Nom|Number=Sing|Person=1|PronType=Prs",
    "Tense=Pres|VerbForm=Fin",
    "Number=Plur",
 ]
 example_deps = ["nsubj", "ROOT", "dobj"]
 example_pos = ["PRON", "VERB", "NOUN"]
 example_ents = ["O", "O", "I-ANIMAL"]
 example_spans = [(2, 3, "ANIMAL")]
 TRAIN_EXAMPLE_1 = dict(
    words=example_words_1,
    lemmas=example_lemmas_1,
    tags=example_tags,
    morphs=example_morphs,
    deps=example_deps,
    heads=[1, 1, 1],
    pos=example_pos,
    ents=example_ents,
    spans=example_spans,
    cats={"CAT": 1.0, "DOG": 0.0},
 )
 TRAIN_EXAMPLE_2 = dict(
    words=example_words_2,
    lemmas=example_lemmas_2,
    tags=example_tags,
    morphs=example_morphs,
    deps=example_deps,
    heads=[1, 1, 1],
    pos=example_pos,
    ents=example_ents,
    spans=example_spans,
    cats={"CAT": 0.0, "DOG": 1.0},
 )
@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [
        ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("parser", [TRAIN_EXAMPLE_1] * 30),
        ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
    ],
 )
 def test_init_config_trainable(component, examples, en_vocab):
    if component == "textcat":
        train_docs = []
        for example in examples:
            doc = Doc(en_vocab, words=example["words"])
            doc.cats = example["cats"]
            train_docs.append(doc)
    elif component == "spancat":
        train_docs = []
        for example in examples:
            doc = Doc(en_vocab, words=example["words"])
            doc.spans["sc"] = [
                Span(doc, start, end, label) for start, end, label in example["spans"]
            ]
            train_docs.append(doc)
    else:
        train_docs = []
        for example in examples:
            # cats, spans are not valid kwargs for instantiating a Doc
            example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
            doc = Doc(en_vocab, **example)
            train_docs.append(doc)
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        assert init_config_result.exit_code == 0
        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0
        assert Path(d_in / "model" / "model-last").exists()
@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
 )
 def test_init_config_trainable_multiple(component, examples, en_vocab):
    train_docs = []
    for example in examples:
        example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
        doc = Doc(en_vocab, **example)
        train_docs.append(doc)
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        assert init_config_result.exit_code == 0
        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0
        assert Path(d_in / "model" / "model-last").exists()