"""Tests for the spaCy command-line interface (convert, train, debug data,
project commands, find-function, etc.), exercised via Typer's CliRunner and
subprocesses."""
import os
import subprocess
import sys
from pathlib import Path

import pytest
import srsly
from typer.testing import CliRunner

import spacy
from spacy.cli._util import app, get_git_version
from spacy.tokens import Doc, DocBin, Span

from .util import make_tempdir, normalize_whitespace


def has_git():
    """Return True if a usable git executable is available (used for skipif)."""
    try:
        get_git_version()
        return True
    except RuntimeError:
        return False


def test_convert_auto():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.iob"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection works
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        assert "data1.spacy" in out_files
        assert "data2.spacy" in out_files
        assert "data3.spacy" in out_files


def test_convert_auto_conflict():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.json"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection warns when there are
        # different file types
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 0


# Minimal pipeline config with two no-op components; used by the multi --code
# tests below. The "noop"/"noop2" factories are registered by the code_paths
# fixture.
NOOP_CONFIG = """
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "mul"
pipeline = ["noop", "noop2"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.noop]
factory = "noop"

[components.noop2]
factory = "noop2"

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 100
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
"""


@pytest.fixture
def data_paths():
    """Yield --paths.train/--paths.dev CLI args pointing at a tiny DocBin."""
    nlp = spacy.blank("mul")
    doc = nlp("ok")
    with make_tempdir() as tdir:
        db = DocBin()
        # debug data will *fail* if there aren't enough docs
        for ii in range(100):
            db.add(doc)
        fpath = tdir / "data.spacy"
        db.to_disk(fpath)

        args = [
            "--paths.train",
            str(fpath),
            "--paths.dev",
            str(fpath),
        ]
        yield args


@pytest.fixture
def code_paths():
    """Yield a --code arg with two files registering the noop/noop2 factories."""
    noop_base = """
from spacy.language import Language

@Language.component("{}")
def noop(doc):
    return doc
"""

    with make_tempdir() as temp_d:
        # write code files to load
        paths = []
        for ff in ["noop", "noop2"]:
            pyfile = temp_d / f"{ff}.py"
            pyfile.write_text(noop_base.format(ff))
            paths.append(pyfile)

        args = ["--code", ",".join([str(pp) for pp in paths])]
        yield args


@pytest.fixture
def noop_config():
    """Yield the path of a config file containing NOOP_CONFIG."""
    with make_tempdir() as temp_d:
        cfg = temp_d / "config.cfg"
        cfg.write_text(NOOP_CONFIG)

        yield cfg


@pytest.mark.slow
@pytest.mark.parametrize(
    "cmd",
    ["debug config", "debug data", "train", "assemble"],
)
def test_multi_code(cmd, code_paths, data_paths, noop_config):
    # check that it fails without the code arg
    cmd = cmd.split()
    # `assemble` additionally requires an output directory argument
    output = ["."] if cmd[0] == "assemble" else []
    cmd = [sys.executable, "-m", "spacy"] + cmd
    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
    assert result.returncode == 1

    # check that it succeeds with the code arg
    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
    assert result.returncode == 0


@pytest.mark.slow
def test_multi_code_evaluate(code_paths, data_paths, noop_config):
    # Evaluation requires a model, not a config, so this works differently from
    # the other commands.

    # Train a model to evaluate
    cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split()
    result = subprocess.run([*cmd, *data_paths, *code_paths])
    assert result.returncode == 0

    # now do the evaluation
    eval_data = data_paths[-1]
    cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split()

    # check that it fails without the code arg
    result = subprocess.run(cmd)
    assert result.returncode == 1

    # check that it succeeds with the code arg
    result = subprocess.run([*cmd, *code_paths])
    assert result.returncode == 0


def test_benchmark_accuracy_alias():
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
    )


def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")

        # `debug data` requires an input pipeline config
        CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )

        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
            ],
        )

        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout


# project tests

CFG_FILE = "myconfig.cfg"

SAMPLE_PROJECT = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            "dest": "assets/spacy-readme.md",
            "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
            "checksum": "411b2c89ccf34288fae8ed126bf652f7",
        },
        {
            "dest": "assets/citation.cff",
            "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
            "checksum": "c996bfd80202d480eb2e592369714e5e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            "name": "create",
            "help": "make a file",
            "script": [f"python -m spacy init config {CFG_FILE}"],
            "outputs": [f"{CFG_FILE}"],
        },
    ],
}

SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)


@pytest.fixture
def project_dir():
    """Yield a temp directory containing a project.yml built from SAMPLE_PROJECT."""
    with make_tempdir() as pdir:
        (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
        yield pdir


def test_project_document(project_dir):
    readme_path = project_dir / "README.md"
    assert not readme_path.exists(), "README already exists"
    result = CliRunner().invoke(
        app, ["project", "document", str(project_dir), "-o", str(readme_path)]
    )
    assert result.exit_code == 0
    assert readme_path.is_file()
    text = readme_path.read_text("utf-8")
    assert SAMPLE_PROJECT["description"] in text


def test_project_assets(project_dir):
    asset_dir = project_dir / "assets"
    assert not asset_dir.exists(), "Assets dir is already present"
    result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
    # check that extras work
    result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"


def test_project_run(project_dir):
    # make sure dry run works
    test_file = project_dir / CFG_FILE
    result = CliRunner().invoke(
        app, ["project", "run", "--dry", "create", str(project_dir)]
    )
    assert result.exit_code == 0
    assert not test_file.is_file()
    result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
    assert result.exit_code == 0
    assert test_file.is_file()
    result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
    assert result.exit_code == 0
    assert "okokok" in result.stdout


@pytest.mark.skipif(not has_git(), reason="git not installed")
@pytest.mark.parametrize(
    "options",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(options):
    with make_tempdir() as workspace:
        out = workspace / "project"
        target = "benchmarks/ner_conll03"
        if not options:
            options = []
        else:
            options = options.split()
        result = CliRunner().invoke(
            app, ["project", "clone", target, *options, str(out)]
        )
        assert result.exit_code == 0
        assert (out / "README.md").is_file()


def test_project_push_pull(project_dir):
    proj = dict(SAMPLE_PROJECT)
    remote = "xyz"

    with make_tempdir() as remote_dir:
        proj["remotes"] = {remote: str(remote_dir)}
        proj_text = srsly.yaml_dumps(proj)
        (project_dir / "project.yml").write_text(proj_text)

        test_file = project_dir / CFG_FILE
        result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
        assert result.exit_code == 0
        assert test_file.is_file()
        result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
        assert result.exit_code == 0
        test_file.unlink()
        assert not test_file.exists()
        result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
        assert result.exit_code == 0
        assert test_file.is_file()


def test_find_function_valid():
    # example of architecture in main code base
    function = "spacy.TextCatBOW.v3"
    result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
    assert f"Found registered function '{function}'" in result.stdout
    assert "textcat.py" in result.stdout

    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Found registered function '{function}'" in result.stdout
    assert "textcat.py" in result.stdout

    # example of architecture in spacy-legacy
    function = "spacy.TextCatBOW.v1"
    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Found registered function '{function}'" in result.stdout
    assert "spacy_legacy" in result.stdout
    assert "textcat.py" in result.stdout


def test_find_function_invalid():
    # invalid registry
    function = "spacy.TextCatBOW.v3"
    registry = "foobar"
    result = CliRunner().invoke(
        app, ["find-function", function, "--registry", registry]
    )
    assert f"Unknown function registry: '{registry}'" in result.stdout

    # invalid function
    function = "spacy.TextCatBOW.v666"
    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Couldn't find registered function: '{function}'" in result.stdout


# Shared annotation data for the trainable-component training tests below.
example_words_1 = ["I", "like", "cats"]
example_words_2 = ["I", "like", "dogs"]
example_lemmas_1 = ["I", "like", "cat"]
example_lemmas_2 = ["I", "like", "dog"]
example_tags = ["PRP", "VBP", "NNS"]
example_morphs = [
    "Case=Nom|Number=Sing|Person=1|PronType=Prs",
    "Tense=Pres|VerbForm=Fin",
    "Number=Plur",
]
example_deps = ["nsubj", "ROOT", "dobj"]
example_pos = ["PRON", "VERB", "NOUN"]
example_ents = ["O", "O", "I-ANIMAL"]
example_spans = [(2, 3, "ANIMAL")]

TRAIN_EXAMPLE_1 = dict(
    words=example_words_1,
    lemmas=example_lemmas_1,
    tags=example_tags,
    morphs=example_morphs,
    deps=example_deps,
    heads=[1, 1, 1],
    pos=example_pos,
    ents=example_ents,
    spans=example_spans,
    cats={"CAT": 1.0, "DOG": 0.0},
)
TRAIN_EXAMPLE_2 = dict(
    words=example_words_2,
    lemmas=example_lemmas_2,
    tags=example_tags,
    morphs=example_morphs,
    deps=example_deps,
    heads=[1, 1, 1],
    pos=example_pos,
    ents=example_ents,
    spans=example_spans,
    cats={"CAT": 0.0, "DOG": 1.0},
)


@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [
        ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("parser", [TRAIN_EXAMPLE_1] * 30),
        ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
    ],
)
def test_init_config_trainable(component, examples, en_vocab):
    if component == "textcat":
        train_docs = []
        for example in examples:
            doc = Doc(en_vocab, words=example["words"])
            doc.cats = example["cats"]
            train_docs.append(doc)
    elif component == "spancat":
        train_docs = []
        for example in examples:
            doc = Doc(en_vocab, words=example["words"])
            doc.spans["sc"] = [
                Span(doc, start, end, label) for start, end, label in example["spans"]
            ]
            train_docs.append(doc)
    else:
        train_docs = []
        for example in examples:
            # cats, spans are not valid kwargs for instantiating a Doc
            example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
            doc = Doc(en_vocab, **example)
            train_docs.append(doc)

    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")

        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        assert init_config_result.exit_code == 0

        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0
        assert Path(d_in / "model" / "model-last").exists()


@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
)
def test_init_config_trainable_multiple(component, examples, en_vocab):
    train_docs = []
    for example in examples:
        # cats, spans are not valid kwargs for instantiating a Doc
        example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
        doc = Doc(en_vocab, **example)
        train_docs.append(doc)
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")

        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        assert init_config_result.exit_code == 0

        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0
        assert Path(d_in / "model" / "model-last").exists()