Add a `spacy evaluate speed` subcommand

This subcommand reports the mean batch performance of a model on a data set, together with a 95% confidence interval. For reliability, it first performs some warmup rounds and then measures performance on batches of randomly shuffled documents. To avoid having too many top-level spaCy commands, `speed` is a subcommand of `evaluate`, and accuracy evaluation is moved to its own `evaluate accuracy` subcommand.
2025-08-04 12:20:20 +03:00 · 2022-11-30 11:56:11 +01:00 · 2022-11-30 11:56:11 +01:00 · 6c43081b78
commit 6c43081b78
parent 6f9d630f7e
5 changed files with 189 additions and 5 deletions
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -15,7 +15,8 @@ from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
+from .evaluate_accuracy import evaluate  # noqa: F401
+from .evaluate_speed import evaluate_cli  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
+EVALUATE_HELP = """Commands for evaluating pipelines."""
 INIT_HELP = """Commands for initializing configs and pipeline packages."""

 # Wrappers for Typer's annotations. Initially created to set defaults and to
@ -56,10 +57,12 @@ Opt = typer.Option
 app = typer.Typer(name=NAME, help=HELP)
 project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
+evaluate_cli = typer.Typer(name="evaluate", help=EVALUATE_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)

 app.add_typer(project_cli)
 app.add_typer(debug_cli)
+app.add_typer(evaluate_cli)
 app.add_typer(init_cli)


--- a/spacy/cli/evaluate_accuracy.py
+++ b/spacy/cli/evaluate_accuracy.py
@ -4,18 +4,20 @@ from pathlib import Path
 import re
 import srsly
 from thinc.api import fix_random_seed
+import typer

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code
+from ._util import Arg, Opt, evaluate_cli, setup_gpu, import_code
 from ..scorer import Scorer
 from .. import util
 from .. import displacy


-@app.command("evaluate")
-def evaluate_cli(
+@evaluate_cli.command("accuracy", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},)
+def accuracy_cli(
    # fmt: off
+    ctx: typer.Context,
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
    output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
--- a/spacy/cli/evaluate_speed.py
+++ b/spacy/cli/evaluate_speed.py
@ -0,0 +1,178 @@
+from typing import Iterable, List, Optional
+import random
+from itertools import islice
+import numpy
+from pathlib import Path
+import time
+from tqdm import tqdm
+import typer
+
+from .. import Language, util
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, evaluate_cli, setup_gpu
+
+
+@evaluate_cli.command(
+    "speed",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def benchmark_cli(
+    ctx: typer.Context,
+    model: str = Arg(..., help="Model name or path"),
+    data_path: Path = Arg(
+        ..., help="Location of binary evaluation data in .spacy format", exists=True
+    ),
+    batch_size: Optional[int] = Opt(
+        None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"
+    ),
+    no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    n_batches: int = Opt(
+        50,
+        "--batches",
+        help="Minimum number of batches to benchmark",
+        min=30,
+    ),
+    warmup_epochs: int = Opt(
+        3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"
+    ),
+):
+    """
+    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
+    data in the binary .spacy format.
+    """
+    # The docstring above is kept verbatim: Typer displays it as the help text.
+    setup_gpu(use_gpu=use_gpu, silent=False)
+
+    nlp = util.load_model(model)
+    if batch_size is None:
+        # No override on the command line: use the pipeline's own batch size.
+        batch_size = nlp.batch_size
+    examples = Corpus(data_path)(nlp)
+    docs = [example.predicted for example in examples]
+
+    # Warmup results are discarded; only the benchmark run is reported.
+    print(f"Warming up for {warmup_epochs} epochs...")
+    warmup(nlp, docs, warmup_epochs, batch_size)
+
+    print()
+    print(f"Benchmarking {n_batches} batches...")
+    shuffle = not no_shuffle
+    words_per_second = benchmark(nlp, docs, n_batches, batch_size, shuffle)
+
+    print()
+    print_outliers(words_per_second)
+    print_mean_with_ci(words_per_second)
+
+
+# Lowercased, behaves as a context manager function.
+class time_context:
+    """Register the running time of a context.
+
+    On entry the current ``time.perf_counter()`` value is stored in ``start``.
+    On exit the difference to that value (in seconds) is stored in ``elapsed``,
+    so ``elapsed`` is only available once the ``with`` block has finished.
+    """
+
+    start: float
+    elapsed: float
+
+    def __enter__(self) -> "time_context":
+        self.start = time.perf_counter()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
+        # The duration is recorded even if the block raised. Returning None
+        # means an exception from the block is not suppressed.
+        self.elapsed = time.perf_counter() - self.start
+
+
+class Quartiles:
+    """Hold the three quartiles of a sample and its inter-quartile range.
+
+    ``q1``, ``q2`` and ``q3`` are the 0.25, 0.5 and 0.75 quantiles as
+    computed by ``numpy.quantile``; ``iqr`` is ``q3 - q1``.
+    """
+
+    q1: float
+    q2: float
+    q3: float
+    iqr: float
+
+    def __init__(self, sample: numpy.ndarray) -> None:
+        # One vectorised call yields all three quantiles at once.
+        lower, median, upper = numpy.quantile(sample, [0.25, 0.5, 0.75])
+        self.q1 = lower
+        self.q2 = median
+        self.q3 = upper
+        self.iqr = upper - lower
+
+
+def annotate(
+    nlp: Language, docs: List[Doc], batch_size: Optional[int]
+) -> numpy.ndarray:
+    """Run ``docs`` through ``nlp.pipe`` and time it batch by batch.
+
+    The piped docs are consumed in chunks of ``batch_size`` (or
+    ``nlp.batch_size`` when ``batch_size`` is falsy). For every non-empty
+    chunk, the token count divided by the time needed to pull the chunk from
+    the pipe is recorded.
+
+    RETURNS (numpy.ndarray): One tokens-per-second value per chunk.
+    """
+    piped = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
+    chunk_size = batch_size if batch_size else nlp.batch_size
+    rates = []
+    while True:
+        # Only the consumption of the pipe generator is timed.
+        with time_context() as timer:
+            chunk = list(islice(piped, chunk_size))
+        if not chunk:
+            break
+        rates.append(count_tokens(chunk) / timer.elapsed)
+
+    return numpy.array(rates)
+
+
+def benchmark(
+    nlp: Language,
+    docs: List[Doc],
+    n_batches: int,
+    batch_size: int,
+    shuffle: bool,
+) -> numpy.ndarray:
+    """Benchmark ``nlp`` on ``n_batches * batch_size`` freshly made docs.
+
+    The benchmark docs are rebuilt with ``nlp.make_doc`` from the texts of
+    ``docs``. With ``shuffle`` set, each text is drawn at random (with
+    replacement) using ``random.choice``; otherwise the texts are taken in
+    order, wrapping around when ``docs`` is exhausted.
+
+    RETURNS (numpy.ndarray): The per-batch speeds reported by ``annotate``.
+    RAISES (ValueError): If ``docs`` is empty.
+    """
+    if not docs:
+        # Without this guard an empty corpus fails with an opaque IndexError
+        # (random.choice) or ZeroDivisionError (modulo by len(docs)).
+        raise ValueError(
+            "Cannot benchmark: the benchmark data does not contain any documents."
+        )
+
+    n_docs = n_batches * batch_size
+    if shuffle:
+        bench_docs = [nlp.make_doc(random.choice(docs).text) for _ in range(n_docs)]
+    else:
+        bench_docs = [nlp.make_doc(docs[i % len(docs)].text) for i in range(n_docs)]
+
+    return annotate(nlp, bench_docs, batch_size)
+
+
+def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
+    """Evaluate ``statistic`` on ``iterations`` resamples of ``x``.
+
+    Each resample has the same length as ``x`` and is drawn with replacement
+    through ``numpy.random.choice``. The results are returned as a float64
+    array with one entry per iteration.
+    """
+    sample_size = len(x)
+    results = numpy.empty(iterations, dtype=numpy.float64)
+    for i in range(iterations):
+        resample = numpy.random.choice(x, sample_size, replace=True)
+        results[i] = statistic(resample)
+    return results
+
+
+def count_tokens(docs: Iterable[Doc]) -> int:
+    """Return the summed length (``len(doc)``) of all ``docs``."""
+    return sum(map(len, docs))
+
+
+def print_mean_with_ci(sample: numpy.ndarray):
+    """Print the sample mean with a bootstrapped 95% confidence interval.
+
+    The interval bounds are taken from the sorted bootstrap means at the
+    2.5% and 97.5% positions and printed as offsets relative to the mean.
+    """
+    sample_mean = numpy.mean(sample)
+    sorted_means = numpy.sort(bootstrap(sample))
+    n_means = len(sorted_means)
+
+    # 95% confidence interval, expressed relative to the sample mean
+    lower_offset = sorted_means[int(n_means * 0.025)] - sample_mean
+    upper_offset = sorted_means[int(n_means * 0.975)] - sample_mean
+
+    print(
+        f"Mean: {sample_mean:.1f} WPS (95% CI: {lower_offset:.1f} +{upper_offset:.1f})"
+    )
+
+
+def print_outliers(sample: numpy.ndarray):
+    """Print the percentage of outliers and extreme outliers in ``sample``.
+
+    A value counts as an outlier when it lies more than 1.5 inter-quartile
+    ranges below q1 or above q3, and as an extreme outlier when it lies more
+    than 3.0 inter-quartile ranges outside. Both percentages are printed
+    with one decimal place.
+    """
+    quartiles = Quartiles(sample)
+
+    def percent_outside(factor: float) -> float:
+        # Share of the sample outside [q1 - factor * iqr, q3 + factor * iqr].
+        lower = quartiles.q1 - factor * quartiles.iqr
+        upper = quartiles.q3 + factor * quartiles.iqr
+        n_outside = numpy.sum((sample < lower) | (sample > upper))
+        return (100 * n_outside) / len(sample)
+
+    # Both figures use the same ".1f" spec; previously the second one was
+    # printed with full float precision.
+    print(
+        f"Outliers: {percent_outside(1.5):.1f}%, extreme outliers: {percent_outside(3.0):.1f}%"
+    )
+
+
+def warmup(
+    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
+) -> numpy.ndarray:
+    """Annotate ``docs`` repeated ``warmup_epochs`` times.
+
+    RETURNS (numpy.ndarray): The per-batch speeds reported by ``annotate``.
+    """
+    repeated_docs = warmup_epochs * docs
+    return annotate(nlp, repeated_docs, batch_size)
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -8,7 +8,7 @@ from wasabi import msg

 import spacy
 from spacy import util
-from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat
+from spacy.cli.evaluate_accuracy import print_prf_per_type, print_textcats_auc_per_cat
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.pipeline import TextCategorizer