"""Tests that drive spaCy CLI commands through Typer's ``CliRunner``."""
import os
from pathlib import Path

from typer.testing import CliRunner

from spacy.tokens import DocBin, Doc
from spacy.cli._util import app

from .util import make_tempdir, normalize_whitespace


def test_convert_auto():
    """`convert` on a directory of same-suffix files writes one output each.

    Three empty ``.iob`` files are created in a temporary input directory;
    after running ``convert`` the output directory must hold exactly the
    three matching ``.spacy`` files.
    """
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.iob"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection works
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        assert "data1.spacy" in out_files
        assert "data2.spacy" in out_files
        assert "data3.spacy" in out_files


def test_convert_auto_conflict():
    """`convert` on a directory with mixed suffixes reports it and writes nothing.

    The input directory holds two ``.iob`` files and one ``.json`` file; the
    command output must contain the "same type" message and the output
    directory must stay empty.
    """
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.json"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection warns when there are
        # different file types
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 0


def test_benchmark_accuracy_alias():
    """`evaluate --help` matches `benchmark accuracy --help` modulo the name.

    The command name in the ``evaluate`` output is rewritten to
    ``spacy benchmark accuracy`` and both texts are whitespace-normalized
    before being compared.
    """
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
    )


def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    """`debug data` prints the trainable-lemmatizer section for such a pipeline.

    Small train/dev ``DocBin`` files with lemma annotations are written to a
    temporary directory, a config with a ``trainable_lemmatizer`` pipeline is
    generated via ``init config``, and ``debug data`` is run against it.

    Args:
        en_vocab: vocab fixture used to construct the ``Doc`` objects.
    """
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")

        config_path = f"{d_in}/config.cfg"
        train_path = f"{d_in}/train.spacy"
        dev_path = f"{d_in}/dev.spacy"

        # `debug data` requires an input pipeline config
        result_init = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                config_path,
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        # Fail here, with the command output, if the setup step itself broke;
        # otherwise the problem would only surface as a confusing miss in the
        # `debug data` assertion below.
        assert result_init.exit_code == 0, result_init.stdout

        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                config_path,
                "--paths.train",
                train_path,
                "--paths.dev",
                dev_path,
            ],
        )
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout