Reduce memory arena contention

Previously there was one memory arena for all threads, making it
the bottleneck for multi-threaded performance. As the number of
threads increased, the contention for the lock on the arena would
grow, causing other threads to wait to acquire it.

This commit makes it use 8 memory arenas, and round-robins how
they are assigned to threads. Threads keep track of the index that
they should use into the arena array, assigned the first time the
arena is accessed on a given thread.

When an image is first created, it is allocated from an arena.
When the logic to have multiple arenas is enabled, it then keeps
track of the index on the image, so that when deleted it can be
returned to the correct arena.

Effectively this means that in single-threaded programs, this
should not really have an effect. We also do not do this logic if
the GIL is enabled, as it effectively acts as the lock on the
default arena for us.

As expected, this approach has no real noticeable effect on regular
CPython. On free-threaded CPython, however, there is a massive
difference (measuring up to about 70%).
This commit is contained in:
Kevin Newton 2025-01-24 14:14:33 -05:00
parent 51df14282f
commit fdd80169fd
4 changed files with 288 additions and 67 deletions

View File

@ -8,18 +8,21 @@
# ------------------------------
from __future__ import annotations
import distutils.ccompiler
import os
import re
import shutil
import struct
import subprocess
import sys
import tempfile
import warnings
from collections.abc import Iterator
from typing import Any
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools.errors import CompileError
def get_version() -> str:
@ -292,6 +295,47 @@ def _pkg_config(name: str) -> tuple[list[str], list[str]] | None:
return None
def _try_compile(compiler: distutils.ccompiler.CCompiler, code: str) -> bool:
    """Return True if *code* compiles cleanly with warnings promoted to errors.

    The snippet is written to a throwaway ``test.c`` inside a temporary
    directory; ``-Werror`` ensures that a compiler which merely warns about
    the probed construct still makes the probe fail.
    """
    try:
        with tempfile.TemporaryDirectory() as workdir:
            source_path = os.path.join(workdir, "test.c")
            with open(source_path, "w") as source:
                source.write(code)
            compiler.compile([source_path], output_dir=workdir, extra_preargs=["-Werror"])
    except CompileError:
        # The construct under test is not supported by this compiler.
        return False
    return True
def _try_compile_attr(compiler: distutils.ccompiler.CCompiler, attr: str) -> bool:
    """Return True if ``attr`` is accepted as a qualifier on a global variable.

    Used to probe thread-local-storage keywords (``thread_local``,
    ``_Thread_local``, ``__thread``, ``__declspec(thread)``).  The pragmas
    turn "-Wattributes" into a hard error so that compilers which only warn
    about an unknown qualifier still cause the probe to fail.
    """
    code = f"""
#pragma GCC diagnostic error "-Wattributes"
#pragma clang diagnostic error "-Wattributes"
int {attr} foo;
int main() {{
return 0;
}}
"""
    return _try_compile(compiler, code)
def _try_compile_tls_define_macros(
    compiler: distutils.ccompiler.CCompiler,
) -> list[tuple[str, str | None]]:
    """Probe which thread-local-storage keyword the compiler supports.

    Candidates are tried from most to least standard; the first one that
    compiles determines the HAVE_* macro to define.  An empty list means no
    TLS keyword is available.
    """
    candidates = (
        ("thread_local", "HAVE_THREAD_LOCAL"),  # C23
        ("_Thread_local", "HAVE__THREAD_LOCAL"),  # C11/C17
        ("__thread", "HAVE___THREAD"),  # GCC/clang
        ("__declspec(thread)", "HAVE___DECLSPEC_THREAD_"),  # MSVC
    )
    for keyword, macro in candidates:
        if _try_compile_attr(compiler, keyword):
            return [(macro, None)]
    return []
class pil_build_ext(build_ext):
class ext_feature:
features = [
@ -426,13 +470,14 @@ class pil_build_ext(build_ext):
def _update_extension(
self,
name: str,
libraries: list[str] | list[str | bool | None],
libraries: list[str] | list[str | bool | None] | None = None,
define_macros: list[tuple[str, str | None]] | None = None,
sources: list[str] | None = None,
) -> None:
for extension in self.extensions:
if extension.name == name:
extension.libraries += libraries
if libraries is not None:
extension.libraries += libraries
if define_macros is not None:
extension.define_macros += define_macros
if sources is not None:
@ -890,7 +935,10 @@ class pil_build_ext(build_ext):
defs.append(("PILLOW_VERSION", f'"{PILLOW_VERSION}"'))
self._update_extension("PIL._imaging", libs, defs)
tls_define_macros = _try_compile_tls_define_macros(self.compiler)
self._update_extension("PIL._imaging", libs, defs + tls_define_macros)
self._update_extension("PIL._imagingmath", define_macros=tls_define_macros)
self._update_extension("PIL._imagingmorph", define_macros=tls_define_macros)
#
# additional libraries
@ -913,7 +961,9 @@ class pil_build_ext(build_ext):
libs.append(feature.get("fribidi"))
else: # building FriBiDi shim from src/thirdparty
srcs.append("src/thirdparty/fribidi-shim/fribidi.c")
self._update_extension("PIL._imagingft", libs, defs, srcs)
self._update_extension(
"PIL._imagingft", libs, defs + tls_define_macros, srcs
)
else:
self._remove_extension("PIL._imagingft")
@ -922,19 +972,19 @@ class pil_build_ext(build_ext):
libs = [feature.get("lcms")]
if sys.platform == "win32":
libs.extend(["user32", "gdi32"])
self._update_extension("PIL._imagingcms", libs)
self._update_extension("PIL._imagingcms", libs, tls_define_macros)
else:
self._remove_extension("PIL._imagingcms")
webp = feature.get("webp")
if isinstance(webp, str):
libs = [webp, webp + "mux", webp + "demux"]
self._update_extension("PIL._webp", libs)
self._update_extension("PIL._webp", libs, tls_define_macros)
else:
self._remove_extension("PIL._webp")
tk_libs = ["psapi"] if sys.platform in ("win32", "cygwin") else []
self._update_extension("PIL._imagingtk", tk_libs)
self._update_extension("PIL._imagingtk", tk_libs, tls_define_macros)
build_ext.build_extensions(self)

View File

@ -3938,34 +3938,50 @@ _get_stats(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = &ImagingDefaultArena;
long stats_new_count = 0;
long stats_allocated_blocks = 0;
long stats_reused_blocks = 0;
long stats_reallocated_blocks = 0;
long stats_freed_blocks = 0;
long blocks_cached = 0;
v = PyLong_FromLong(arena->stats_new_count);
uint8_t index;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(index, arena) {
MUTEX_LOCK(&arena->mutex);
stats_new_count += arena->stats_new_count;
stats_allocated_blocks += arena->stats_allocated_blocks;
stats_reused_blocks += arena->stats_reused_blocks;
stats_reallocated_blocks += arena->stats_reallocated_blocks;
stats_freed_blocks += arena->stats_freed_blocks;
blocks_cached += arena->blocks_cached;
MUTEX_UNLOCK(&arena->mutex);
}
v = PyLong_FromLong(stats_new_count);
PyDict_SetItemString(d, "new_count", v ? v : Py_None);
Py_XDECREF(v);
v = PyLong_FromLong(arena->stats_allocated_blocks);
v = PyLong_FromLong(stats_allocated_blocks);
PyDict_SetItemString(d, "allocated_blocks", v ? v : Py_None);
Py_XDECREF(v);
v = PyLong_FromLong(arena->stats_reused_blocks);
v = PyLong_FromLong(stats_reused_blocks);
PyDict_SetItemString(d, "reused_blocks", v ? v : Py_None);
Py_XDECREF(v);
v = PyLong_FromLong(arena->stats_reallocated_blocks);
v = PyLong_FromLong(stats_reallocated_blocks);
PyDict_SetItemString(d, "reallocated_blocks", v ? v : Py_None);
Py_XDECREF(v);
v = PyLong_FromLong(arena->stats_freed_blocks);
v = PyLong_FromLong(stats_freed_blocks);
PyDict_SetItemString(d, "freed_blocks", v ? v : Py_None);
Py_XDECREF(v);
v = PyLong_FromLong(arena->blocks_cached);
v = PyLong_FromLong(blocks_cached);
PyDict_SetItemString(d, "blocks_cached", v ? v : Py_None);
Py_XDECREF(v);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
return d;
}
@ -3975,14 +3991,17 @@ _reset_stats(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = &ImagingDefaultArena;
arena->stats_new_count = 0;
arena->stats_allocated_blocks = 0;
arena->stats_reused_blocks = 0;
arena->stats_reallocated_blocks = 0;
arena->stats_freed_blocks = 0;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
uint8_t index;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(index, arena) {
MUTEX_LOCK(&arena->mutex);
arena->stats_new_count = 0;
arena->stats_allocated_blocks = 0;
arena->stats_reused_blocks = 0;
arena->stats_reallocated_blocks = 0;
arena->stats_freed_blocks = 0;
MUTEX_UNLOCK(&arena->mutex);
}
Py_INCREF(Py_None);
return Py_None;
@ -3994,9 +4013,10 @@ _get_alignment(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
int alignment = ImagingDefaultArena.alignment;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
int alignment = arena->alignment;
MUTEX_UNLOCK(&arena->mutex);
return PyLong_FromLong(alignment);
}
@ -4006,9 +4026,10 @@ _get_block_size(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
int block_size = ImagingDefaultArena.block_size;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
int block_size = arena->block_size;
MUTEX_UNLOCK(&arena->mutex);
return PyLong_FromLong(block_size);
}
@ -4018,9 +4039,10 @@ _get_blocks_max(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
int blocks_max = ImagingDefaultArena.blocks_max;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
int blocks_max = arena->blocks_max;
MUTEX_UNLOCK(&arena->mutex);
return PyLong_FromLong(blocks_max);
}
@ -4041,9 +4063,13 @@ _set_alignment(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.alignment = alignment;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
uint8_t index;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(index, arena) {
MUTEX_LOCK(&arena->mutex);
arena->alignment = alignment;
MUTEX_UNLOCK(&arena->mutex);
}
Py_INCREF(Py_None);
return Py_None;
@ -4066,9 +4092,13 @@ _set_block_size(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.block_size = block_size;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
uint8_t index;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(index, arena) {
MUTEX_LOCK(&arena->mutex);
arena->block_size = block_size;
MUTEX_UNLOCK(&arena->mutex);
}
Py_INCREF(Py_None);
return Py_None;
@ -4087,15 +4117,21 @@ _set_blocks_max(PyObject *self, PyObject *args) {
}
if ((unsigned long)blocks_max >
SIZE_MAX / sizeof(ImagingDefaultArena.blocks_pool[0])) {
SIZE_MAX / sizeof(ImagingGetArena()->blocks_pool[0])) {
PyErr_SetString(PyExc_ValueError, "blocks_max is too large");
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
int status = ImagingMemorySetBlocksMax(&ImagingDefaultArena, blocks_max);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
if (!status) {
int status = 0;
uint8_t index;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(index, arena) {
MUTEX_LOCK(&arena->mutex);
status |= ImagingMemorySetBlocksMax(arena, blocks_max);
MUTEX_UNLOCK(&arena->mutex);
}
if (status) {
return ImagingError_MemoryError();
}
@ -4111,9 +4147,13 @@ _clear_cache(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryClearCache(&ImagingDefaultArena, i);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
uint8_t index;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(index, arena) {
MUTEX_LOCK(&arena->mutex);
ImagingMemoryClearCache(arena, i);
MUTEX_UNLOCK(&arena->mutex);
}
Py_INCREF(Py_None);
return Py_None;

View File

@ -51,6 +51,20 @@ extern "C" {
* extensions, see http://www.effbot.org/zone/pil-extending.htm
*/
#ifdef Py_GIL_DISABLED
#if defined(__cplusplus)
#define IMAGING_TLS thread_local
#elif defined(HAVE_THREAD_LOCAL)
#define IMAGING_TLS thread_local
#elif defined(HAVE__THREAD_LOCAL)
#define IMAGING_TLS _Thread_local
#elif defined(HAVE___THREAD)
#define IMAGING_TLS __thread
#elif defined(HAVE___DECLSPEC_THREAD_)
#define IMAGING_TLS __declspec(thread)
#endif
#endif
/* Handles */
typedef struct ImagingMemoryInstance *Imaging;
@ -104,6 +118,10 @@ struct ImagingMemoryInstance {
/* Virtual methods */
void (*destroy)(Imaging im);
#ifdef IMAGING_TLS
int arenaindex; /* Index of the arena this image is associated with. */
#endif
};
#define IMAGING_PIXEL_1(im, x, y) ((im)->image8[(y)][(x)])
@ -161,6 +179,9 @@ typedef struct ImagingMemoryArena {
int stats_reallocated_blocks; /* Number of blocks which were actually reallocated
after retrieving */
int stats_freed_blocks; /* Number of freed blocks */
#ifdef IMAGING_TLS
int index; /* Index of the arena in the global array. */
#endif
#ifdef Py_GIL_DISABLED
PyMutex mutex;
#endif
@ -169,7 +190,34 @@ typedef struct ImagingMemoryArena {
/* Objects */
/* ------- */
#ifdef IMAGING_TLS
/* In this case we both do not have the GIL and have thread-local storage, so we
* will allocate a set of arenas and associate them with threads one at a time.
*/
#define IMAGING_ARENAS_COUNT 8
extern struct ImagingMemoryArena ImagingArenas[IMAGING_ARENAS_COUNT];
/* Provide a macro that loops through each arena that has been
* statically-allocated. This is necessary to properly handle stats.
*/
#define IMAGING_ARENAS_FOREACH(index, arena) \
for (index = 0, (arena) = &ImagingArenas[index]; index < IMAGING_ARENAS_COUNT; (arena) = &ImagingArenas[++index])
#else
/* In this case we either have the GIL or do not have thread-local storage, in
* which case we will only allocate a single arena.
*/
extern struct ImagingMemoryArena ImagingDefaultArena;
/* Provide a macro that loops through each arena that has been
* statically-allocated. In this case because there is only one, this is
* effectively a single block of code.
*/
#define IMAGING_ARENAS_FOREACH(index, arena) \
for ((void) index, (arena) = &ImagingDefaultArena; (arena); (arena) = NULL)
#endif
ImagingMemoryArena ImagingGetArena(void);
extern int
ImagingMemorySetBlocksMax(ImagingMemoryArena arena, int blocks_max);
extern void

View File

@ -218,9 +218,10 @@ ImagingNewPrologueSubtype(const char *mode, int xsize, int ysize, int size) {
break;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.stats_new_count += 1;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
arena->stats_new_count += 1;
MUTEX_UNLOCK(&arena->mutex);
return im;
}
@ -258,23 +259,101 @@ ImagingDelete(Imaging im) {
/* Allocate image as an array of line buffers. */
#define IMAGING_PAGE_SIZE (4096)
#define IMAGING_ARENA_BLOCK_SIZE (16 * 1024 * 1024)
#ifdef IMAGING_TLS
/* This is the overall process-level index that keeps track of the next index
* that will be assigned to a thread.
*/
static uint64_t ImagingArenaIndex = UINT64_MAX;
/* This is the thread-local index that associates a thread with an arena in the
* statically-allocated list.
*/
static IMAGING_TLS uint64_t ImagingArenaThreadIndex = UINT64_MAX;
/* These are the statically-allocated arenas. */
struct ImagingMemoryArena ImagingArenas[IMAGING_ARENAS_COUNT] = {
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 0, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 1, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 2, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 3, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 4, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 5, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 6, {0} },
{ 1, IMAGING_ARENA_BLOCK_SIZE, 0, 0, NULL, 0, 0, 0, 0, 0, 7, {0} }
};
/* Get a pointer to the correct arena for this context. In this case where we
* are using a round-robin approach to the statically allocated arenas, we will
* return the arena that is assigned to the thread on first use.
*/
/* Return the arena assigned to the calling thread, assigning one lazily on
 * first use.  The sentinel UINT64_MAX marks "not yet assigned"; the atomic
 * fetch-add on the process-wide counter hands out distinct successive values
 * to concurrent first-time callers, which modulo the arena count yields a
 * round-robin distribution across the statically-allocated arenas. */
ImagingMemoryArena ImagingGetArena(void) {
    if (ImagingArenaThreadIndex == UINT64_MAX) {
        /* NOTE(review): assumes _Py_atomic_add_uint64 has fetch-add
         * (returns-previous-value) semantics — confirm against the CPython
         * pyatomic headers for the supported versions. */
        ImagingArenaThreadIndex = _Py_atomic_add_uint64(&ImagingArenaIndex, 1) % IMAGING_ARENAS_COUNT;
    }
    return &ImagingArenas[ImagingArenaThreadIndex];
}
/* Return the arena associated with the given image. In this case the index of
* the arena is stored on the image itself.
*/
/* Return the arena the given image was allocated from, using the index that
 * was recorded on the image at allocation time.  Blocks must be returned to
 * the same arena they came from, regardless of which thread frees them. */
ImagingMemoryArena ImagingGetArenaFromImaging(Imaging im) {
    int arenaindex = im->arenaindex;
    /* Debug-build sanity check only: an out-of-range index would mean the
     * image was created without ImagingSetArenaOnImaging being called. */
    assert(arenaindex >= 0 && arenaindex < IMAGING_ARENAS_COUNT);
    return &ImagingArenas[arenaindex];
}
/* Set the arena index on the given image based on the index of the arena. This
* is necessary in order to return the blocks to the correct arena when the
* image is destroyed.
*/
/* Record on the image which arena it was allocated from, so that
 * ImagingGetArenaFromImaging can find the right arena at destruction time. */
static void ImagingSetArenaOnImaging(Imaging im, ImagingMemoryArena arena) {
    im->arenaindex = arena->index;
}
#else
/* Because we have the GIL (or do not have thread-local storage), we only have a
* single arena.
*/
struct ImagingMemoryArena ImagingDefaultArena = {
1, // alignment
16 * 1024 * 1024, // block_size
0, // blocks_max
0, // blocks_cached
NULL, // blocks_pool
1, // alignment
IMAGING_ARENA_BLOCK_SIZE, // block_size
0, // blocks_max
0, // blocks_cached
NULL, // blocks_pool
0,
0,
0,
0,
0, // Stats
#ifdef Py_GIL_DISABLED
/* On the very off-chance that someone is running free-threaded Python on a
* platform that does not support thread-local storage, we need a mutex
* here.
*/
{0},
#endif
};
/* Get a pointer to the correct arena for this context. In this case where we
* either have the GIL or we do not have TLS, we will return only the default
* arena.
*/
/* Single-arena build (GIL present or no TLS support): every caller shares
 * the one default arena, so no per-thread lookup is needed. */
ImagingMemoryArena ImagingGetArena(void) {
    return &ImagingDefaultArena;
}
/* Return the arena associated with the given image. In this case because we
* only have one arena, we always return the default arena.
*/
#define ImagingGetArenaFromImaging(im) &ImagingDefaultArena
/* Set the arena index on the given image based on the index of the arena. In
* this case because we only have one arena, we do not need to do anything.
*/
#define ImagingSetArenaOnImaging(im, arena)
#endif
int
ImagingMemorySetBlocksMax(ImagingMemoryArena arena, int blocks_max) {
void *p;
@ -288,18 +367,18 @@ ImagingMemorySetBlocksMax(ImagingMemoryArena arena, int blocks_max) {
p = realloc(arena->blocks_pool, sizeof(*arena->blocks_pool) * blocks_max);
if (!p) {
// Leave previous blocks_max value
return 0;
return 1;
}
arena->blocks_pool = p;
} else {
arena->blocks_pool = calloc(sizeof(*arena->blocks_pool), blocks_max);
if (!arena->blocks_pool) {
return 0;
return 1;
}
}
arena->blocks_max = blocks_max;
return 1;
return 0;
}
void
@ -369,12 +448,13 @@ ImagingDestroyArray(Imaging im) {
int y = 0;
if (im->blocks) {
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArenaFromImaging(im);
MUTEX_LOCK(&arena->mutex);
while (im->blocks[y].ptr) {
memory_return_block(&ImagingDefaultArena, im->blocks[y]);
memory_return_block(arena, im->blocks[y]);
y += 1;
}
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
MUTEX_UNLOCK(&arena->mutex);
free(im->blocks);
}
}
@ -504,11 +584,14 @@ ImagingNewInternal(const char *mode, int xsize, int ysize, int dirty) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
ImagingSetArenaOnImaging(im, arena);
MUTEX_LOCK(&arena->mutex);
Imaging tmp = ImagingAllocateArray(
im, &ImagingDefaultArena, dirty, ImagingDefaultArena.block_size
im, arena, dirty, arena->block_size
);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
MUTEX_UNLOCK(&arena->mutex);
if (tmp) {
return im;
}
@ -516,9 +599,9 @@ ImagingNewInternal(const char *mode, int xsize, int ysize, int dirty) {
ImagingError_Clear();
// Try to allocate the image once more with smallest possible block size
MUTEX_LOCK(&ImagingDefaultArena.mutex);
tmp = ImagingAllocateArray(im, &ImagingDefaultArena, dirty, IMAGING_PAGE_SIZE);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
MUTEX_LOCK(&arena->mutex);
tmp = ImagingAllocateArray(im, arena, dirty, IMAGING_PAGE_SIZE);
MUTEX_UNLOCK(&arena->mutex);
if (tmp) {
return im;
}