Thread-local arenas

Currently, all threads use the same arena for imaging. This can
result in a lot of contention when there are enough workers and
the mutex is constantly being checked.

This commit instead introduces lockless thread-local arenas for
environments that support it.
This commit is contained in:
Kevin Newton 2025-01-09 12:15:36 -05:00
parent f521a4be7d
commit cfb2dcdfd0
4 changed files with 114 additions and 37 deletions

View File

@ -8,18 +8,21 @@
# ------------------------------
from __future__ import annotations
import distutils.ccompiler
import os
import re
import shutil
import struct
import subprocess
import sys
import tempfile
import warnings
from collections.abc import Iterator
from typing import Any
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools.errors import CompileError
def get_version() -> str:
@ -292,6 +295,47 @@ def _pkg_config(name: str) -> tuple[list[str], list[str]] | None:
return None
def _try_compile(compiler: distutils.ccompiler.CCompiler, code: str) -> bool:
try:
with tempfile.TemporaryDirectory() as d:
fn = os.path.join(d, "test.c")
with open(fn, "w") as f:
f.write(code)
compiler.compile([fn], output_dir=d, extra_preargs=["-Werror"])
return True
except CompileError:
return False
def _try_compile_attr(compiler: distutils.ccompiler.CCompiler, attr: str) -> bool:
code = f"""
#pragma GCC diagnostic error "-Wattributes"
#pragma clang diagnostic error "-Wattributes"
int {attr} foo;
int main() {{
return 0;
}}
"""
return _try_compile(compiler, code)
def _try_compile_tls_define_macros(
compiler: distutils.ccompiler.CCompiler,
) -> list[tuple[str, str | None]]:
if _try_compile_attr(compiler, "thread_local"): # C23
return [("HAVE_THREAD_LOCAL", None)]
elif _try_compile_attr(compiler, "_Thread_local"): # C11/C17
return [("HAVE__THREAD_LOCAL", None)]
elif _try_compile_attr(compiler, "__thread"): # GCC/clang
return [("HAVE___THREAD", None)]
elif _try_compile_attr(compiler, "__declspec(thread)"): # MSVC
return [("HAVE___DECLSPEC_THREAD_", None)]
else:
return []
class pil_build_ext(build_ext):
class ext_feature:
features = [
@ -426,12 +470,13 @@ class pil_build_ext(build_ext):
def _update_extension(
self,
name: str,
libraries: list[str] | list[str | bool | None],
libraries: list[str] | list[str | bool | None] | None = None,
define_macros: list[tuple[str, str | None]] | None = None,
sources: list[str] | None = None,
) -> None:
for extension in self.extensions:
if extension.name == name:
if libraries is not None:
extension.libraries += libraries
if define_macros is not None:
extension.define_macros += define_macros
@ -890,7 +935,10 @@ class pil_build_ext(build_ext):
defs.append(("PILLOW_VERSION", f'"{PILLOW_VERSION}"'))
self._update_extension("PIL._imaging", libs, defs)
tls_define_macros = _try_compile_tls_define_macros(self.compiler)
self._update_extension("PIL._imaging", libs, defs + tls_define_macros)
self._update_extension("PIL._imagingmath", define_macros=tls_define_macros)
self._update_extension("PIL._imagingmorph", define_macros=tls_define_macros)
#
# additional libraries
@ -913,7 +961,9 @@ class pil_build_ext(build_ext):
libs.append(feature.get("fribidi"))
else: # building FriBiDi shim from src/thirdparty
srcs.append("src/thirdparty/fribidi-shim/fribidi.c")
self._update_extension("PIL._imagingft", libs, defs, srcs)
self._update_extension(
"PIL._imagingft", libs, defs + tls_define_macros, srcs
)
else:
self._remove_extension("PIL._imagingft")
@ -922,19 +972,19 @@ class pil_build_ext(build_ext):
libs = [feature.get("lcms")]
if sys.platform == "win32":
libs.extend(["user32", "gdi32"])
self._update_extension("PIL._imagingcms", libs)
self._update_extension("PIL._imagingcms", libs, tls_define_macros)
else:
self._remove_extension("PIL._imagingcms")
webp = feature.get("webp")
if isinstance(webp, str):
libs = [webp, webp + "mux", webp + "demux"]
self._update_extension("PIL._webp", libs)
self._update_extension("PIL._webp", libs, tls_define_macros)
else:
self._remove_extension("PIL._webp")
tk_libs = ["psapi"] if sys.platform in ("win32", "cygwin") else []
self._update_extension("PIL._imagingtk", tk_libs)
self._update_extension("PIL._imagingtk", tk_libs, tls_define_macros)
build_ext.build_extensions(self)

View File

@ -3938,7 +3938,7 @@ _get_stats(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = &ImagingDefaultArena;
v = PyLong_FromLong(arena->stats_new_count);
@ -3965,7 +3965,7 @@ _get_stats(PyObject *self, PyObject *args) {
PyDict_SetItemString(d, "blocks_cached", v ? v : Py_None);
Py_XDECREF(v);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
return d;
}
@ -3975,14 +3975,14 @@ _reset_stats(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = &ImagingDefaultArena;
arena->stats_new_count = 0;
arena->stats_allocated_blocks = 0;
arena->stats_reused_blocks = 0;
arena->stats_reallocated_blocks = 0;
arena->stats_freed_blocks = 0;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
Py_INCREF(Py_None);
return Py_None;
@ -3994,9 +3994,9 @@ _get_alignment(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
int alignment = ImagingDefaultArena.alignment;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
return PyLong_FromLong(alignment);
}
@ -4006,9 +4006,9 @@ _get_block_size(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
int block_size = ImagingDefaultArena.block_size;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
return PyLong_FromLong(block_size);
}
@ -4018,9 +4018,9 @@ _get_blocks_max(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
int blocks_max = ImagingDefaultArena.blocks_max;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
return PyLong_FromLong(blocks_max);
}
@ -4041,9 +4041,9 @@ _set_alignment(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.alignment = alignment;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
Py_INCREF(Py_None);
return Py_None;
@ -4066,9 +4066,9 @@ _set_block_size(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.block_size = block_size;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
Py_INCREF(Py_None);
return Py_None;
@ -4092,9 +4092,9 @@ _set_blocks_max(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
int status = ImagingMemorySetBlocksMax(&ImagingDefaultArena, blocks_max);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
if (!status) {
return ImagingError_MemoryError();
}
@ -4111,9 +4111,9 @@ _clear_cache(PyObject *self, PyObject *args) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryClearCache(&ImagingDefaultArena, i);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
Py_INCREF(Py_None);
return Py_None;

View File

@ -149,6 +149,32 @@ struct ImagingPaletteInstance {
int keep_cache; /* This palette will be reused; keep cache */
};
#define IMAGING_ARENA_LOCK(m)
#define IMAGING_ARENA_UNLOCK(m)
#if defined(__cplusplus)
#define IMAGING_ARENA_TLS thread_local
#elif defined(HAVE_THREAD_LOCAL)
#define IMAGING_ARENA_TLS thread_local
#elif defined(HAVE__THREAD_LOCAL)
#define IMAGING_ARENA_TLS _Thread_local
#elif defined(HAVE___THREAD)
#define IMAGING_ARENA_TLS __thread
#elif defined(HAVE___DECLSPEC_THREAD_)
#define IMAGING_ARENA_TLS __declspec(thread)
#elif defined(Py_GIL_DISABLED)
#define IMAGING_ARENA_TLS
#define IMAGING_ARENA_LOCKING
#undef IMAGING_ARENA_LOCK
#undef IMAGING_ARENA_UNLOCK
#define IMAGING_ARENA_LOCK(m) PyMutex_Lock(m)
#define IMAGING_ARENA_UNLOCK(m) PyMutex_Unlock(m)
#else
#define IMAGING_ARENA_TLS
#endif
typedef struct ImagingMemoryArena {
int alignment; /* Alignment in memory of each line of an image */
int block_size; /* Preferred block size, bytes */
@ -161,7 +187,8 @@ typedef struct ImagingMemoryArena {
int stats_reallocated_blocks; /* Number of blocks which were actually reallocated
after retrieving */
int stats_freed_blocks; /* Number of freed blocks */
#ifdef Py_GIL_DISABLED
#ifdef IMAGING_ARENA_LOCKING
PyMutex mutex;
#endif
} *ImagingMemoryArena;
@ -169,7 +196,7 @@ typedef struct ImagingMemoryArena {
/* Objects */
/* ------- */
extern struct ImagingMemoryArena ImagingDefaultArena;
extern IMAGING_ARENA_TLS struct ImagingMemoryArena ImagingDefaultArena;
extern int
ImagingMemorySetBlocksMax(ImagingMemoryArena arena, int blocks_max);
extern void

View File

@ -218,9 +218,9 @@ ImagingNewPrologueSubtype(const char *mode, int xsize, int ysize, int size) {
break;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.stats_new_count += 1;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
return im;
}
@ -259,7 +259,7 @@ ImagingDelete(Imaging im) {
#define IMAGING_PAGE_SIZE (4096)
struct ImagingMemoryArena ImagingDefaultArena = {
IMAGING_ARENA_TLS struct ImagingMemoryArena ImagingDefaultArena = {
1, // alignment
16 * 1024 * 1024, // block_size
0, // blocks_max
@ -270,7 +270,7 @@ struct ImagingMemoryArena ImagingDefaultArena = {
0,
0,
0, // Stats
#ifdef Py_GIL_DISABLED
#ifdef IMAGING_ARENA_LOCKING
{0},
#endif
};
@ -369,12 +369,12 @@ ImagingDestroyArray(Imaging im) {
int y = 0;
if (im->blocks) {
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
while (im->blocks[y].ptr) {
memory_return_block(&ImagingDefaultArena, im->blocks[y]);
y += 1;
}
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
free(im->blocks);
}
}
@ -504,11 +504,11 @@ ImagingNewInternal(const char *mode, int xsize, int ysize, int dirty) {
return NULL;
}
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
Imaging tmp = ImagingAllocateArray(
im, &ImagingDefaultArena, dirty, ImagingDefaultArena.block_size
);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
if (tmp) {
return im;
}
@ -516,9 +516,9 @@ ImagingNewInternal(const char *mode, int xsize, int ysize, int dirty) {
ImagingError_Clear();
// Try to allocate the image once more with smallest possible block size
MUTEX_LOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_LOCK(&ImagingDefaultArena.mutex);
tmp = ImagingAllocateArray(im, &ImagingDefaultArena, dirty, IMAGING_PAGE_SIZE);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
IMAGING_ARENA_UNLOCK(&ImagingDefaultArena.mutex);
if (tmp) {
return im;
}