add custom code support to CLI speed benchmark (#13247)

* add custom code support to CLI speed benchmark
* sort imports
* better copying for warmup docs
2025-12-16 14:44:19 +03:00 · 2024-01-26 07:29:22 -05:00 · 2024-01-26 07:29:22 -05:00 · 00e938a7c3
commit 00e938a7c3
parent 68b85ea950
2 changed files with 17 additions and 14 deletions
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
@benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    # fmt: on
 ):
    """
    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
    data in the binary .spacy format.
    """
+    import_code(code_path)
    setup_gpu(use_gpu=use_gpu, silent=False)
    nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
    return annotate(nlp, docs, batch_size)
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1268,13 +1268,14 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
 warmed up before any measurements are taken.
 ```cli
-$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
+$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch-size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 ```
 | Name                 | Description                                                                                                                                                                          |
-| -------------------- | -------------------------------------------------------------------------------------------------------- |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             |
 | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             |
 | `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       |
 | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      |
 | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |