From b4e457d9fe43c186b4540107ca70e1d1bb60c75a Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 1 Aug 2023 22:24:02 +0900
Subject: [PATCH] Accept multiple code files in all CLI commands (#12101)

* Add support for multiple code files to all relevant commands

Prior to this, only the package command supported multiple code files.

* Update docs

* Add debug data test, plus generic fixtures

One tricky thing here: it's tempting to create the config by creating a
pipeline in code, but that requires declaring the custom components
here. However, the CliRunner appears to be run in the same process or
otherwise have access to our registry, so it works even without any
code arguments. So it's necessary to avoid declaring the components in
the tests.

* Add debug config test and restructure

The code argument imports the provided file. If it adds items to the
registry, that affects global state, which CliRunner doesn't isolate.
Since there's no standard way to remove things from the registry, this
instead uses subprocess.run to run commands.

* Use a more generic, parametrized test

* Add output arg for assemble and pretrain

Assemble and pretrain require an output argument. This commit adds
assemble testing, but not pretrain, as that requires an actual trainable
component, which is not currently in the test config.

* Add evaluate test and some cleanup

* Mark tests as slow

* Revert argument name change

* Apply suggestions from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Format API CLI docs

* isort

* Fix imports in tests

* isort

* Undo changes to package CLI help

* Fix python executable and lang code in test

* Fix executable in another test

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 spacy/cli/_util.py          |   7 ++
 spacy/cli/assemble.py       |   6 +-
 spacy/cli/debug_config.py   |   6 +-
 spacy/cli/debug_data.py     |   6 +-
 spacy/cli/evaluate.py       |   6 +-
 spacy/cli/package.py        |   2 +-
 spacy/cli/pretrain.py       |   6 +-
 spacy/cli/train.py          |   6 +-
 spacy/tests/test_cli_app.py | 207 ++++++++++++++++++++++++++++++++++++
 website/docs/api/cli.mdx    | 107 ++++++++++---------
 10 files changed, 287 insertions(+), 72 deletions(-)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index f2ac20ae5..ca92cdd23 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -350,6 +350,13 @@ def show_validation_error(
         msg.fail("Config validation error", e, exits=1)
 
 
+def import_code_paths(code_paths: str) -> None:
+    """Import each Python file named in a comma-separated string of paths."""
+    # Surrounding whitespace is stripped from every entry before importing.
+    for raw_path in string_to_list(code_paths):
+        import_code(Path(raw_path.strip()))
+
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index ee2500b27..f552c8459 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -11,7 +11,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -26,7 +26,7 @@ def assemble_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
@@ -45,7 +45,7 @@ def assemble_cli(
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 0e5382cd9..7818b4087 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -13,7 +13,7 @@ from ._util import (
     Arg,
     Opt,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -27,7 +27,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on
@@ -44,7 +44,7 @@ def debug_config_cli(
     DOCS: https://spacy.io/api/cli#debug-config
     """
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
     )
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 4c44a8c0e..714969be1 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -40,7 +40,7 @@ from ._util import (
     _format_number,
     app,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )
@@ -72,7 +72,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
@@ -92,7 +92,7 @@ def debug_data_cli(
             "--help for an overview of the other available debugging commands."
         )
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_data(
         config_path,
         config_overrides=overrides,
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 6235b658d..f035aa3ce 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -10,7 +10,7 @@ from .. import displacy, util
 from ..scorer import Scorer
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
 
 
 @benchmark_cli.command(
@@ -22,7 +22,7 @@ def evaluate_cli(
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
     output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -42,7 +42,7 @@ def evaluate_cli(
 
     DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
-    import_code(code_path)
+    import_code_paths(code_path)
     evaluate(
         model,
         data_path,
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 4545578e6..01449f957 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -20,7 +20,7 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
+    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 446c40510..73337a7ca 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -11,7 +11,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -27,7 +27,7 @@ def pretrain_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
     output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -56,7 +56,7 @@ def pretrain_cli(
     DOCS: https://spacy.io/api/cli#pretrain
     """
     config_overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
     setup_gpu(use_gpu)
     msg.info(f"Loading config from: {config_path}")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8bdabd39c..eb1a1a2c1 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -13,7 +13,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     setup_gpu,
     show_validation_error,
@@ -28,7 +28,7 @@ def train_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
     # fmt: on
@@ -49,7 +49,7 @@ def train_cli(
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
 
 
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 3a426113b..2424138d3 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,10 +1,13 @@
 import os
+import subprocess
+import sys
 from pathlib import Path
 
 import pytest
 import srsly
 from typer.testing import CliRunner
 
+import spacy
 from spacy.cli._util import app, get_git_version
 from spacy.tokens import Doc, DocBin
 
@@ -46,6 +49,210 @@ def test_convert_auto_conflict():
         assert len(out_files) == 0
 
 
+NOOP_CONFIG = """
+[paths]
+train = null
+dev = null
+vectors = null
+init_tok2vec = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "mul"
+pipeline = ["noop", "noop2"]
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+[components]
+
+[components.noop]
+factory = "noop"
+
+[components.noop2]
+factory = "noop2"
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[training]
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1600
+max_epochs = 0
+max_steps = 100
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+dev_corpus = "corpora.dev"
+
+train_corpus = "corpora.train"
+before_to_disk = null
+before_update = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
+"""
+
+
+@pytest.fixture
+def data_paths():
+    """Yield config overrides pointing train and dev at one temporary corpus.
+
+    Both overrides name the same DocBin file, which holds 100 copies of a
+    single short doc and is written inside a `make_tempdir()` context.
+    """
+    nlp = spacy.blank("mul")
+    sample_doc = nlp("ok")
+    with make_tempdir() as tmp_dir:
+        corpus_path = tmp_dir / "data.spacy"
+        doc_bin = DocBin()
+        # debug data will *fail* if there aren't enough docs
+        for _ in range(100):
+            doc_bin.add(sample_doc)
+        doc_bin.to_disk(corpus_path)
+
+        corpus = str(corpus_path)
+        yield ["--paths.train", corpus, "--paths.dev", corpus]
+
+
+@pytest.fixture
+def code_paths():
+    """Yield a `--code` argument naming two generated component modules."""
+    component_template = """
+from spacy.language import Language
+
+@Language.component("{}")
+def noop(doc):
+    return doc
+"""
+    with make_tempdir() as tmp_dir:
+        # one module per component name, each registering just that component
+        module_paths = []
+        for name in ("noop", "noop2"):
+            module_path = tmp_dir / f"{name}.py"
+            module_path.write_text(component_template.format(name))
+            module_paths.append(str(module_path))
+
+        # the CLI takes all code files as a single comma-separated value
+        yield ["--code", ",".join(module_paths)]
+
+
+@pytest.fixture
+def noop_config():
+    """Yield the path of a temporary config file holding NOOP_CONFIG."""
+    with make_tempdir() as tmp_dir:
+        config_path = tmp_dir / "config.cfg"
+        config_path.write_text(NOOP_CONFIG)
+        yield config_path
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "cmd",
+    ["debug config", "debug data", "train", "assemble"],
+)
+def test_multi_code(cmd, code_paths, data_paths, noop_config):
+    # assemble needs an output dir: keep it in the config's temp dir, not the cwd
+    output = [str(noop_config.parent / "pipeline")] if cmd == "assemble" else []
+    cmd = [sys.executable, "-m", "spacy", *cmd.split()]
+    # check that it fails without the code arg
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+
+@pytest.mark.slow
+def test_multi_code_evaluate(code_paths, data_paths, noop_config):
+    # Evaluation requires a model, not a config, so this works differently from
+    # the other commands. Commands are argv lists (never split strings) so paths
+    # with spaces survive; the model is written beside the temporary config.
+    spacy_cmd = [sys.executable, "-m", "spacy"]
+    model_dir = noop_config.parent / "model"
+
+    # Train a model to evaluate
+    cmd = [*spacy_cmd, "train", str(noop_config), "-o", str(model_dir)]
+    result = subprocess.run([*cmd, *data_paths, *code_paths])
+    assert result.returncode == 0
+
+    # now do the evaluation; check that it fails without the code arg
+    eval_data = data_paths[-1]
+    cmd = [*spacy_cmd, "evaluate", str(model_dir / "model-best"), eval_data]
+    result = subprocess.run(cmd)
+    assert result.returncode == 1
+
+    # check that it succeeds with the code arg
+    result = subprocess.run([*cmd, *code_paths])
+    assert result.returncode == 0
+
+
 def test_benchmark_accuracy_alias():
     # Verify that the `evaluate` alias works correctly.
     result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index b6a32f722..002b0b39d 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -175,15 +175,15 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
-| Name                   | Description                                                                                                                                                                          |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                            |
-| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                   |
-| `--code`, `-c`         | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                    |
-| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                        |
-| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                   |
+| Name                   | Description                                                                                                                                                                                            |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                                              |
+| `output_file`          | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                                     |
+| `--code`, `-c`         | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                                      |
+| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                                          |
+| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                                     |
 
 ### init vectors {id="init-vectors",version="3",tag="command"}
 
@@ -242,7 +242,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_path`     | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~                                                                                                               |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages for debugging purposes. ~~bool (flag)~~                                                                                                                                                |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -446,7 +446,7 @@ File       /path/to/thinc/thinc/schedules.py (line 91)
 | Name                     | Description                                                                                                                                                                                                                    |
 | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`            | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~             |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                                           |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                         |
 | `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~                                             |
 | `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                     |
@@ -631,7 +631,7 @@ will not be available.
 | Name                       | Description                                                                                                                                                                                                        |
 | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`              | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `--code`, `-c`             | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`             | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~                                                                                                                                                       |
 | `--verbose`, `-V`          | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                     |
 | `--no-format`, `-NF`       | Don't pretty-print the results. Use this if you want to write to a file. ~~bool (flag)~~                                                                                                                           |
@@ -1055,7 +1055,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
@@ -1125,6 +1125,6 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`                                      | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`                                       | Directory to save binary weights to on each epoch. ~~Path (positional)~~                                                                                                                                           |
-| `--code`, `-c`                                     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
+| `--code`, `-c`                                     | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~             |
 | `--resume-path`, `-r`                              | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~                                                                                                                          |
 | `--epoch-resume`, `-er`                            | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~                                                                |
@@ -1162,19 +1163,19 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
 ```
 
-| Name                                                 | Description                                                                                                                                                                          |
-| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                            |
-| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                  |
-| `--code`, `-c` <Tag variant="new">3</Tag>            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
-| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
-| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                           |
-| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                            |
+| Name                                                 | Description                                                                                                                                                                                            |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                                              | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`                                          | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                                              |
+| `--output`, `-o`                                     | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~                                                                                                    |
+| `--code`, `-c` <Tag variant="new">3</Tag>            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gold-preproc`, `-G`                               | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--gpu-id`, `-g`                                     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                                             |
+| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~                              |
+| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                                             |
+| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                                              |
 
 ### speed {id="benchmark-speed", version="3.5", tag="command"}
 
@@ -1216,19 +1217,19 @@ When a directory is provided it is traversed recursively to collect all files.
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
 
-| Name                      | Description                                                                                                                                                                          |
-| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                  |
-| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                 |
-| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                           |
-| `--code`, `-c`            | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                            |
-| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                    |
-| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                  |
-| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                         |
-| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
-| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
+| Name                      | Description                                                                                                                                                                                            |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                   | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                    |
+| `data_path`               | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~                                                                   |
+| `output-file`             | Output `DocBin` path. ~~str (positional)~~                                                                                                                                                             |
+| `--code`, `-c`            | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk`       | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~                                                                                              |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~                                      |
+| `--gpu-id`, `-g`          | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--batch-size`, `-b`      | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                                    |
+| `--n-process`, `-n`       | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~                                                                                                                           |
+| `--help`, `-h`            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
+| **CREATES**               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                                               |
 
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
@@ -1255,19 +1256,19 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```
 
-| Name                     | Description                                                                                                                                                                          |
-| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                           |
-| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                |
-| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                         |
-| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                        |
-| `scores_key`             | Name of score to metric to optimize. ~~str (positional)~~                                                                                                                            |
-| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                   |
-| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
-| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                              |
-| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                     |
-| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| Name                     | Description                                                                                                                                                                                            |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~                                                                                                             |
+| `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~                                                                                                                  |
+| `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~                                                                                                                                           |
+| `threshold_key`          | Key of threshold attribute in component's configuration. ~~str (positional)~~                                                                                                                          |
+| `scores_key`             | Name of the score metric to optimize. ~~str (positional)~~                                                                                                                                             |
+| `--n_trials`, `-n`       | Number of trials to determine optimal thresholds. ~~int (option)~~                                                                                                                                     |
+| `--code`, `-c`           | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                                         |
+| `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~                                                                                                                                                                |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~                                                                                                                                       |
+| `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                             |
 
 ## assemble {id="assemble",tag="command"}
 
@@ -1291,7 +1292,7 @@ $ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [over
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `config_path`     | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir`      | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                   |
-| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~                                                |
+| `--code`, `-c`    | Comma-separated paths to Python files with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~        |
 | `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~                                                                                                                                                |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                    |
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~                            |