diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index cc97922a2..e82165f1d 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -202,28 +202,39 @@ def noop_config():
 
 @pytest.mark.parametrize(
     "cmd",
-    [
-        ["debug", "config"],
-        ["debug", "data"],
-        ["train"],
-        ["assemble"],
-    ],
+    ["debug config", "debug data", "train", "assemble"],
 )
 def test_multi_code(cmd, code_paths, data_paths, noop_config):
     # check that it fails without the code arg
-    output = ["."] if cmd[0] in ("pretrain", "assemble") else []
+    cmd = cmd.split()
+    output = ["."] if cmd[0] == "assemble" else []
     cmd = ["python", "-m", "spacy"] + cmd
     result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
     assert result.returncode == 1
 
     # check that it succeeds with the code arg
-    result = subprocess.run(
-        [
-            *cmd,
-            str(noop_config),
-            *output,
-            *data_paths,
-            *code_paths,
-        ]
-    )
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+
+def test_multi_code_evaluate(code_paths, data_paths, noop_config):
+    # Evaluation requires a model, not a config, so this works differently from
+    # the other commands.
+
+    # Train a model to evaluate
+    cmd = f"python -m spacy train {noop_config} -o model".split()
+    result = subprocess.run([*cmd, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+    # now do the evaluation
+
+    eval_data = data_paths[-1]
+    cmd = f"python -m spacy evaluate model/model-best {eval_data}".split()
+
+    # check that it fails without the code arg
+    result = subprocess.run(cmd)
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, *code_paths])
     assert result.returncode == 0