# Mirror of https://github.com/explosion/spaCy.git
import random
import time
from itertools import islice
from pathlib import Path
from typing import Iterable, List, Optional

import numpy
import typer
from tqdm import tqdm
from wasabi import msg

from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu


@benchmark_cli.command(
    "speed",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
    # fmt: off
    ctx: typer.Context,
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
    batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
    no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30),
    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
    # fmt: on
):
    """
    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
    data in the binary .spacy format.
    """
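    # Example invocation (the model name and data path are placeholders):
    #
    #   python -m spacy benchmark speed en_core_web_sm ./dev.spacy --batches 50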
    setup_gpu(use_gpu=use_gpu, silent=False)

    nlp = util.load_model(model)
    batch_size = batch_size if batch_size is not None else nlp.batch_size
    corpus = Corpus(data_path)
    docs = [eg.predicted for eg in corpus(nlp)]

    if len(docs) == 0:
        msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)

    print(f"Warming up for {warmup_epochs} epochs...")
    warmup(nlp, docs, warmup_epochs, batch_size)

    print()
    print(f"Benchmarking {n_batches} batches...")
    wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)

    print()
    print_outliers(wps)
    print_mean_with_ci(wps)


# Lowercase class name: instances are used as context managers, so the class
# reads like a context-manager function at the call site.
class time_context:
    """Register the running time of a context."""

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, type, value, traceback):
        self.elapsed = time.perf_counter() - self.start


class Quartiles:
    """Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
    of a sample."""

    q1: float
    q2: float
    q3: float
    iqr: float

    def __init__(self, sample: numpy.ndarray) -> None:
        self.q1 = numpy.quantile(sample, 0.25)
        self.q2 = numpy.quantile(sample, 0.5)
        self.q3 = numpy.quantile(sample, 0.75)
        self.iqr = self.q3 - self.q1


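# annotate() streams the docs through nlp.pipe, which is lazy: pulling one
# batch out of the generator with islice() inside time_context measures the
# time spent annotating that batch. The returned array holds one
# words-per-second value per batch.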
def annotate(
    nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
    docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size)
    wps = []
    while True:
        with time_context() as elapsed:
            batch_docs = list(
                islice(docs, batch_size if batch_size else nlp.batch_size)
            )
        if len(batch_docs) == 0:
            break
        n_tokens = count_tokens(batch_docs)
        wps.append(n_tokens / elapsed.elapsed)

    return numpy.array(wps)


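# The benchmark input is rebuilt with nlp.make_doc, so every Doc is a fresh,
# unannotated copy of a corpus text and the pipeline does its full work on
# each one. With shuffling, texts are drawn at random (with replacement);
# otherwise the corpus is cycled in order.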
def benchmark(
    nlp: Language,
    docs: List[Doc],
    n_batches: int,
    batch_size: int,
    shuffle: bool,
) -> numpy.ndarray:
    if shuffle:
        bench_docs = [
            nlp.make_doc(random.choice(docs).text)
            for _ in range(n_batches * batch_size)
        ]
    else:
        bench_docs = [
            nlp.make_doc(docs[i % len(docs)].text)
            for i in range(n_batches * batch_size)
        ]

    return annotate(nlp, bench_docs, batch_size)


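# Used by print_mean_with_ci below: resampling the per-batch speeds with
# replacement and applying the statistic to each resample gives a bootstrap
# distribution of that statistic (here, the mean words-per-second figure).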
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
    """Apply a statistic to repeated random samples of an array."""
    return numpy.fromiter(
        (
            statistic(numpy.random.choice(x, len(x), replace=True))
            for _ in range(iterations)
        ),
        numpy.float64,
    )


def count_tokens(docs: Iterable[Doc]) -> int:
    return sum(len(doc) for doc in docs)


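# Percentile bootstrap: the 2.5th and 97.5th percentiles of the sorted
# bootstrap means form a 95% confidence interval, reported below as offsets
# relative to the observed mean.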
def print_mean_with_ci(sample: numpy.ndarray):
    mean = numpy.mean(sample)
    bootstrap_means = bootstrap(sample)
    bootstrap_means.sort()

    # 95% confidence interval
    low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
    high = bootstrap_means[int(len(bootstrap_means) * 0.975)]

    print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")


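# Tukey's rule: batches more than 1.5 * IQR outside the quartiles are counted
# as outliers, and more than 3.0 * IQR as extreme outliers. A high share of
# outliers suggests a noisy benchmarking environment.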
def print_outliers(sample: numpy.ndarray):
    quartiles = Quartiles(sample)

    n_outliers = numpy.sum(
        (sample < (quartiles.q1 - 1.5 * quartiles.iqr))
        | (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
    )
    n_extreme_outliers = numpy.sum(
        (sample < (quartiles.q1 - 3.0 * quartiles.iqr))
        | (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
    )
    print(
        f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample):.1f}%"
    )


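# The warmup pass runs the whole corpus warmup_epochs times before any
# measurement, so one-off costs such as lazy initialisation or GPU kernel
# compilation are paid up front rather than during the benchmark.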
def warmup(
    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
) -> numpy.ndarray:
    docs = warmup_epochs * docs
    return annotate(nlp, docs, batch_size)