# spaCy/spacy/tests/test_cli_app.py

import os
from pathlib import Path
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc

from spacy.cli._util import app
from .util import make_tempdir


def test_convert_auto():
    """Run `convert` on a directory of same-suffix files with no converter given.

    Three empty `.iob` files are created in a temporary input directory and
    `convert` is invoked with only the input and output directories. The
    command must report a generated output file and leave exactly one
    `.spacy` file per input in the output directory.
    """
    stems = ("data1", "data2", "data3")
    with make_tempdir() as d_in:
        with make_tempdir() as d_out:
            for stem in stems:
                Path(d_in / f"{stem}.iob").touch()

            # ensure that "automatic" suffix detection works
            cli_args = ["convert", str(d_in), str(d_out)]
            outcome = CliRunner().invoke(app, cli_args)
            assert "Generated output file" in outcome.stdout

            produced = os.listdir(d_out)
            assert len(produced) == 3
            for stem in stems:
                assert f"{stem}.spacy" in produced


def test_convert_auto_conflict():
    """Run `convert` on a directory whose files have differing suffixes.

    Two `.iob` files and one `.json` file are created in the input directory.
    The command output must contain the "All input files must be same type"
    message and the output directory must stay empty.
    """
    mixed_inputs = ("data1.iob", "data2.iob", "data3.json")
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for file_name in mixed_inputs:
            Path(d_in / file_name).touch()

        # ensure that "automatic" suffix detection warns when there are
        # different file types
        runner = CliRunner()
        outcome = runner.invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in outcome.stdout

        # nothing may have been written for the conflicting inputs
        assert os.listdir(d_out) == []


def test_benchmark_accuracy_alias():
    """Verify that the `evaluate` alias works correctly.

    Both `benchmark accuracy --help` and `evaluate --help` are invoked, and
    the help texts must match once "spacy evaluate" in the alias output is
    replaced by "spacy benchmark accuracy".
    """
    runner = CliRunner()
    result_benchmark = runner.invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = runner.invoke(app, ["evaluate", "--help"])

    # Guard against a vacuous pass: if both invocations failed in the same way
    # (e.g. both produced empty output), the equality below would still hold.
    assert result_benchmark.exit_code == 0, result_benchmark.stdout
    assert result_evaluate.exit_code == 0, result_evaluate.stdout

    assert result_benchmark.stdout == result_evaluate.stdout.replace(
        "spacy evaluate", "spacy benchmark accuracy"
    )


def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    """Check that `debug data` prints its trainable-lemmatizer section.

    Small lemma-annotated train and dev sets are serialized as `DocBin` files
    into a temporary directory, a config for a `trainable_lemmatizer` pipeline
    is generated there with `init config`, and `debug data` is run against
    that config with the two files passed as `--paths.train` / `--paths.dev`.

    en_vocab: vocab object handed to every `Doc` built here (presumably a
        pytest fixture defined outside this file).
    """
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        DocBin(docs=train_docs).to_disk(d_in / "train.spacy")
        DocBin(docs=dev_docs).to_disk(d_in / "dev.spacy")

        config_path = f"{d_in}/config.cfg"
        # `debug data` requires an input pipeline config
        result_init = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                config_path,
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        # Fail at the setup step, with its output, instead of letting a missing
        # config surface later as an unrelated "section not found" failure.
        assert Path(config_path).exists(), result_init.stdout

        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                config_path,
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
            ],
        )
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout
fix processing of "auto" in convert (#12050) * fix processing of "auto" in walk_directory * add check for None * move AUTO check to convert and fix verification of args * add specific CLI test with CliRunner * cleanup * more cleanup * update docstring 2023-01-05 12:21:00 +03:00			`import os`
			`from pathlib import Path`
			`from typer.testing import CliRunner`
`trainable_lemmatizer` in `debug data` (#11419) * WIP * rm ipython embeds * rm total * WIP * cleanup * cleanup + reword * rm component function * remove migration support form * fix reference dataset for dev data * additional fixes - set approach to identifying unique trees - adjust line length on messages - add logic for detecting docs without annotations * use 0 instead of none for no annotation * partial annotation support * initial tests for _compile_gold lemma attributes Using the example data from the edit tree lemmatizer tests for: - lemmatizer_trees - partial_lemma_annotations - n_low_cardinality_lemmas - no_lemma_annotations * adds output test for cli app * switch msg level * rm unclear uniqueness check * Revert "rm unclear uniqueness check" This reverts commit 6ea2b3524bd66417b3d54854bb7d27ea7e6029ae. * remove good message on uniqueness * formatting * use en_vocab fixture * clarify data set source in messages * remove unnecessary import Co-authored-by: svlandeg <svlandeg@github.com> 2023-01-26 19:36:50 +03:00			`from spacy.tokens import DocBin, Doc`
fix processing of "auto" in convert (#12050) * fix processing of "auto" in walk_directory * add check for None * move AUTO check to convert and fix verification of args * add specific CLI test with CliRunner * cleanup * more cleanup * update docstring 2023-01-05 12:21:00 +03:00
			`from spacy.cli._util import app`
			`from .util import make_tempdir`


			`def test_convert_auto():`
			`with make_tempdir() as d_in, make_tempdir() as d_out:`
			`for f in ["data1.iob", "data2.iob", "data3.iob"]:`
			`Path(d_in / f).touch()`

			`# ensure that "automatic" suffix detection works`
			`result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])`
			`assert "Generated output file" in result.stdout`
			`out_files = os.listdir(d_out)`
			`assert len(out_files) == 3`
			`assert "data1.spacy" in out_files`
			`assert "data2.spacy" in out_files`
			`assert "data3.spacy" in out_files`


			`def test_convert_auto_conflict():`
			`with make_tempdir() as d_in, make_tempdir() as d_out:`
			`for f in ["data1.iob", "data2.iob", "data3.json"]:`
			`Path(d_in / f).touch()`

			`# ensure that "automatic" suffix detection warns when there are different file types`
			`result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])`
			`assert "All input files must be same type" in result.stdout`
			`out_files = os.listdir(d_out)`
			`assert len(out_files) == 0`
Add a `spacy benchmark speed` subcommand (#11902) * Add a `spacy evaluate speed` subcommand This subcommand reports the mean batch performance of a model on a data set with a 95% confidence interval. For reliability, it first performs some warmup rounds. Then it will measure performance on batches with randomly shuffled documents. To avoid having too many spaCy commands, `speed` is a subcommand of `evaluate` and accuracy evaluation is moved to its own `evaluate accuracy` subcommand. * Fix import cycle * Restore `spacy evaluate`, make `spacy benchmark speed` an alias * Add documentation for `spacy benchmark` * CREATES -> PRINTS * WPS -> words/s * Disable formatting of benchmark speed arguments * Fail with an error message when trying to speed bench empty corpus * Make it clearer that `benchmark accuracy` is a replacement for `evaluate` * Fix docstring webpage reference * tests: check `evaluate` output against `benchmark accuracy` 2023-01-12 13:55:21 +03:00

			`def test_benchmark_accuracy_alias():`
			# Verify that the `evaluate` alias works correctly.
			`result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])`
			`result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])`
			`assert result_benchmark.stdout == result_evaluate.stdout.replace(`
			`"spacy evaluate", "spacy benchmark accuracy"`
			`)`
`trainable_lemmatizer` in `debug data` (#11419) * WIP * rm ipython embeds * rm total * WIP * cleanup * cleanup + reword * rm component function * remove migration support form * fix reference dataset for dev data * additional fixes - set approach to identifying unique trees - adjust line length on messages - add logic for detecting docs without annotations * use 0 instead of none for no annotation * partial annotation support * initial tests for _compile_gold lemma attributes Using the example data from the edit tree lemmatizer tests for: - lemmatizer_trees - partial_lemma_annotations - n_low_cardinality_lemmas - no_lemma_annotations * adds output test for cli app * switch msg level * rm unclear uniqueness check * Revert "rm unclear uniqueness check" This reverts commit 6ea2b3524bd66417b3d54854bb7d27ea7e6029ae. * remove good message on uniqueness * formatting * use en_vocab fixture * clarify data set source in messages * remove unnecessary import Co-authored-by: svlandeg <svlandeg@github.com> 2023-01-26 19:36:50 +03:00

			`def test_debug_data_trainable_lemmatizer_cli(en_vocab):`
			`train_docs = [`
			`Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),`
			`Doc(`
			`en_vocab,`
			`words=["Dogs", "are", "great", "too"],`
			`lemmas=["dog", "be", "great", "too"],`
			`),`
			`]`
			`dev_docs = [`
			`Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),`
			`Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),`
			`]`
			`with make_tempdir() as d_in:`
			`train_bin = DocBin(docs=train_docs)`
			`train_bin.to_disk(d_in / "train.spacy")`
			`dev_bin = DocBin(docs=dev_docs)`
			`dev_bin.to_disk(d_in / "dev.spacy")`
			# `debug data` requires an input pipeline config
			`CliRunner().invoke(`
			`app,`
			`[`
			`"init",`
			`"config",`
			`f"{d_in}/config.cfg",`
			`"--lang",`
			`"en",`
			`"--pipeline",`
			`"trainable_lemmatizer",`
			`],`
			`)`
			`result_debug_data = CliRunner().invoke(`
			`app,`
			`[`
			`"debug",`
			`"data",`
			`f"{d_in}/config.cfg",`
			`"--paths.train",`
			`f"{d_in}/train.spacy",`
			`"--paths.dev",`
			`f"{d_in}/dev.spacy",`
			`],`
			`)`
			`# Instead of checking specific wording of the output, which may change,`
			`# we'll check that this section of the debug output is present.`
			`assert "= Trainable Lemmatizer =" in result_debug_data.stdout`