From a0a195688f87ffa6c37d7d4994a37a2acfd7e2ad Mon Sep 17 00:00:00 2001
From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com>
Date: Mon, 31 Jul 2023 08:45:04 -0400
Subject: [PATCH] Tests for CLI app - `init config` generates `train`-able
 config (#12173)

* remove migration support form

* initial test commit

* add fixture

* add combo test

* pull out parameter example data

* fix formatting on examples

* remove unused import

* remove unnecessary fmt:off instructions

* only set logger level if verbose flag is explicitly set

---------

Co-authored-by: svlandeg <svlandeg@github.com>
---
 spacy/cli/assemble.py       |   3 +-
 spacy/cli/find_threshold.py |   4 +-
 spacy/cli/init_pipeline.py  |   9 +-
 spacy/cli/train.py          |   3 +-
 spacy/tests/test_cli_app.py | 161 +++++++++++++++++++++++++++++++++++-
 5 files changed, 172 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index ee2500b27..f74bbacb5 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -40,7 +40,8 @@ def assemble_cli(
 
     DOCS: https://spacy.io/api/cli#assemble
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
+        util.logger.setLevel(logging.DEBUG)
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 7aa32c0c6..48077fa51 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -52,8 +52,8 @@ def find_threshold_cli(
 
     DOCS: https://spacy.io/api/cli#find-threshold
     """
-
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
+        util.logger.setLevel(logging.DEBUG)
     import_code(code_path)
     find_threshold(
         model=model,
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 13202cb60..21eea8edf 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -39,7 +39,8 @@ def init_vectors_cli(
     you can use in the [initialize] block of your config to initialize
     a model with vectors.
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
+        util.logger.setLevel(logging.DEBUG)
     msg.info(f"Creating blank nlp object for language '{lang}'")
     nlp = util.get_lang_class(lang)()
     if jsonl_loc is not None:
@@ -87,7 +88,8 @@ def init_pipeline_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
 ):
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
+        util.logger.setLevel(logging.DEBUG)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     setup_gpu(use_gpu)
@@ -116,7 +118,8 @@ def init_labels_cli(
     """Generate JSON files for the labels in the data. This helps speed up the
     training process, since spaCy won't have to preprocess the data to
     extract the labels."""
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
+        util.logger.setLevel(logging.DEBUG)
     if not output_path.exists():
         output_path.mkdir(parents=True)
     overrides = parse_config_overrides(ctx.args)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8bdabd39c..c72e13b26 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -47,7 +47,8 @@ def train_cli(
 
     DOCS: https://spacy.io/api/cli#train
     """
-    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    if verbose:
+        util.logger.setLevel(logging.DEBUG)
     overrides = parse_config_overrides(ctx.args)
     import_code(code_path)
     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 0e6d8e252..e6f3b5912 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -6,7 +6,7 @@ import srsly
 from typer.testing import CliRunner
 
 from spacy.cli._util import app, get_git_version
-from spacy.tokens import Doc, DocBin
+from spacy.tokens import Doc, DocBin, Span
 
 from .util import make_tempdir, normalize_whitespace
 
@@ -267,3 +267,162 @@ def test_find_function_invalid():
     function = "spacy.TextCatBOW.v666"
     result = CliRunner().invoke(app, ["find-function", function])
     assert f"Couldn't find registered function: '{function}'" in result.stdout
+
+
+# Token-level gold annotations shared by the tiny training examples below.
+example_words_1 = ["I", "like", "cats"]
+example_words_2 = ["I", "like", "dogs"]
+example_lemmas_1 = ["I", "like", "cat"]
+example_lemmas_2 = ["I", "like", "dog"]
+example_tags = ["PRP", "VBP", "NNS"]
+example_morphs = [
+    "Case=Nom|Number=Sing|Person=1|PronType=Prs",
+    "Tense=Pres|VerbForm=Fin",
+    "Number=Plur",
+]
+example_deps = ["nsubj", "ROOT", "dobj"]
+example_pos = ["PRON", "VERB", "NOUN"]
+example_ents = ["O", "O", "I-ANIMAL"]
+# (start, end, label) triples; the tests unpack them into Span(doc, ...).
+example_spans = [(2, 3, "ANIMAL")]
+
+# "cats" and "spans" are set on the Doc after construction by the tests; the
+# remaining keys are passed to Doc() as keyword arguments.
+TRAIN_EXAMPLE_1 = {
+    "words": example_words_1,
+    "lemmas": example_lemmas_1,
+    "tags": example_tags,
+    "morphs": example_morphs,
+    "deps": example_deps,
+    "heads": [1, 1, 1],
+    "pos": example_pos,
+    "ents": example_ents,
+    "spans": example_spans,
+    "cats": {"CAT": 1.0, "DOG": 0.0},
+}
+
+# Same annotations as TRAIN_EXAMPLE_1 except for words, lemmas and cats.
+TRAIN_EXAMPLE_2 = {
+    **TRAIN_EXAMPLE_1,
+    "words": example_words_2,
+    "lemmas": example_lemmas_2,
+    "cats": {"CAT": 0.0, "DOG": 1.0},
+}
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "component,examples",
+    [
+        ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+        ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+        ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+        ("parser", [TRAIN_EXAMPLE_1] * 30),
+        ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+        ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+        ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+    ],
+)
+def test_init_config_trainable(component, examples, en_vocab):
+    """Check that `init config` output for one component can be trained.
+
+    Runs `init config` and then `train` through the CLI on a tiny corpus and
+    expects both to exit with code 0 and `model-last` to exist afterwards.
+    """
+    train_docs = []
+    for example in examples:
+        if component == "textcat":
+            doc = Doc(en_vocab, words=example["words"])
+            doc.cats = example["cats"]
+        elif component == "spancat":
+            doc = Doc(en_vocab, words=example["words"])
+            doc.spans["sc"] = [
+                Span(doc, start, end, label) for start, end, label in example["spans"]
+            ]
+        else:
+            # cats, spans are not valid kwargs for instantiating a Doc
+            example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
+            doc = Doc(en_vocab, **example)
+        train_docs.append(doc)
+
+    with make_tempdir() as d_in:
+        config_path = f"{d_in}/config.cfg"
+        # The same docs are written out as both the train and the dev corpus.
+        DocBin(docs=train_docs).to_disk(d_in / "train.spacy")
+        DocBin(docs=train_docs).to_disk(d_in / "dev.spacy")
+        init_config_result = CliRunner().invoke(
+            app,
+            [
+                "init",
+                "config",
+                config_path,
+                "--lang",
+                "en",
+                "--pipeline",
+                component,
+            ],
+        )
+        # Attach the captured CLI output so a failing command can be diagnosed.
+        assert init_config_result.exit_code == 0, init_config_result.stdout
+        train_result = CliRunner().invoke(
+            app,
+            [
+                "train",
+                config_path,
+                "--paths.train",
+                f"{d_in}/train.spacy",
+                "--paths.dev",
+                f"{d_in}/dev.spacy",
+                "--output",
+                f"{d_in}/model",
+            ],
+        )
+        assert train_result.exit_code == 0, train_result.stdout
+        assert Path(d_in / "model" / "model-last").exists()
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "component,examples",
+    [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
+)
+def test_init_config_trainable_multiple(component, examples, en_vocab):
+    """Check that `init config` output for several components can be trained."""
+    train_docs = []
+    for example in examples:
+        # cats, spans are not valid kwargs for instantiating a Doc
+        doc_kwargs = {k: v for k, v in example.items() if k not in ("cats", "spans")}
+        train_docs.append(Doc(en_vocab, **doc_kwargs))
+
+    with make_tempdir() as d_in:
+        config_path = f"{d_in}/config.cfg"
+        DocBin(docs=train_docs).to_disk(d_in / "train.spacy")
+        DocBin(docs=train_docs).to_disk(d_in / "dev.spacy")
+        init_config_result = CliRunner().invoke(
+            app,
+            [
+                "init",
+                "config",
+                config_path,
+                "--lang",
+                "en",
+                "--pipeline",
+                component,
+            ],
+        )
+        assert init_config_result.exit_code == 0, init_config_result.stdout
+        train_result = CliRunner().invoke(
+            app,
+            [
+                "train",
+                config_path,
+                "--paths.train",
+                f"{d_in}/train.spacy",
+                "--paths.dev",
+                f"{d_in}/dev.spacy",
+                "--output",
+                f"{d_in}/model",
+            ],
+        )
+        assert train_result.exit_code == 0, train_result.stdout
+        assert Path(d_in / "model" / "model-last").exists()