diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 873a3ff66..1ed51cd1f 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,8 +1,12 @@ import os from pathlib import Path +import pytest from typer.testing import CliRunner +import spacy from spacy.cli._util import app +from spacy.language import Language +from spacy.tokens import DocBin from .util import make_tempdir @@ -31,3 +35,178 @@ def test_convert_auto_conflict(): assert "All input files must be same type" in result.stdout out_files = os.listdir(d_out) assert len(out_files) == 0 + + +NOOP_CONFIG = """ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "xx" +pipeline = ["noop", "noop2"] +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.noop] +factory = "noop" + +[components.noop2] +factory = "noop2" + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[training] +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +annotating_components = [] +dev_corpus = "corpora.dev" + +train_corpus = "corpora.train" +before_to_disk = null +before_update = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] +""" + + +@pytest.fixture +def data_paths(): + nlp = spacy.blank("xx") + doc = nlp("ok") + with make_tempdir() as tdir: + db = DocBin() + # debug data will *fail* if there aren't enough docs + for ii in range(100): + db.add(doc) + fpath = tdir / "data.spacy" + db.to_disk(fpath) + + args = [ + "--paths.train", + str(fpath), + "--paths.dev", + str(fpath), + ] + yield args + + +@pytest.fixture +def code_paths(): + noop_base = """ +from spacy.language import Language + +@Language.component("{}") +def noop(doc): + return doc +""" + + with make_tempdir() as temp_d: + # write code files to load + paths = [] + for ff in ["noop", "noop2"]: + pyfile = temp_d / f"{ff}.py" + pyfile.write_text(noop_base.format(ff)) + paths.append(pyfile) + + args = ["--code", ",".join([str(pp) for pp in paths])] + yield args + + +@pytest.fixture +def noop_config(): + with make_tempdir() as temp_d: + cfg = temp_d / "config.cfg" + cfg.write_text(NOOP_CONFIG) + + yield cfg + + +def test_multi_code_debug_data(code_paths, data_paths, noop_config): + # check that it fails without the code arg + result = CliRunner().invoke(app, ["debug", "data", str(noop_config), *data_paths]) + assert result.exit_code == 1 + + # check that it succeeds with the code arg + code_arg = ["--code", ",".join([str(pp) for pp in code_paths])] + result = CliRunner().invoke( + app, ["debug", "data", str(noop_config), *data_paths, *code_paths] + ) + assert result.exit_code == 0