From 85212dbbb6400255a0522a99ce7fa18a40ea3f81 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Sat, 19 Jul 2025 16:55:52 +0200 Subject: [PATCH 1/8] Add image band metadata for the 4 channel images --- Tests/test_pyarrow.py | 27 +++++++++++++ src/libImaging/Arrow.c | 86 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/Tests/test_pyarrow.py b/Tests/test_pyarrow.py index 8dad94fe0..a69504e78 100644 --- a/Tests/test_pyarrow.py +++ b/Tests/test_pyarrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from typing import Any, NamedTuple import pytest @@ -244,3 +245,29 @@ def test_from_int32array(mode: str, data_tp: DataShape, mask: list[int] | None) img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) _test_img_equals_int32_pyarray(img, arr, mask, elts_per_pixel) + + +@pytest.mark.parametrize( + "mode, metadata", + ( + ("LA", ["L", "X", "X", "A"]), + ("RGB", ["R", "G", "B", "X"]), + ("RGBX", ["R", "G", "B", "X"]), + ("RGBA", ["R", "G", "B", "A"]), + ("CMYK", ["C", "M", "Y", "K"]), + ("YCbCr", ["Y", "Cb", "Cr", "X"]), + ("HSV", ["H", "S", "V", "X"]), + ), +) +def test_image_metadata(mode: str, metadata: list[str]) -> None: + img = hopper(mode) + + arr = pyarrow.array(img) # type: ignore[call-overload] + + assert arr.type.field(0).metadata + assert arr.type.field(0).metadata[b"image"] + + parsed_metadata = json.loads(arr.type.field(0).metadata[b"image"].decode("utf8")) + + assert "bands" in parsed_metadata + assert parsed_metadata["bands"] == metadata diff --git a/src/libImaging/Arrow.c b/src/libImaging/Arrow.c index ccafe33b9..2ecec9b29 100644 --- a/src/libImaging/Arrow.c +++ b/src/libImaging/Arrow.c @@ -55,6 +55,77 @@ ReleaseExportedSchema(struct ArrowSchema *array) { // Mark array released array->release = NULL; } +char * +image_band_json(Imaging im) { + char *format = "{\"bands\": [\"%s\", \"%s\", \"%s\", \"%s\"]}"; + char *json; + // Bands can be 4 bands * 2 characters each + int len = strlen(format) + 8 + 1; + int err; + + json = calloc(1, len); + + if (!json) { + return NULL; + } + + err = PyOS_snprintf( + json, + len, + format, + im->band_names[0], + im->band_names[1], + im->band_names[2], + im->band_names[3] + ); + if (err < 0) { + return NULL; + } + return json; +} + +char * +assemble_metadata(const char *band_json) { + /* format is + int32: number of key/value pairs (noted N below) + int32: byte length of key 0 + key 0 (not null-terminated) + int32: byte length of value 0 + value 0 (not null-terminated) + ... + int32: byte length of key N - 1 + key N - 1 (not null-terminated) + int32: byte length of value N - 1 + value N - 1 (not null-terminated) + */ + const char *key = "image"; + INT32 key_len = strlen(key); + INT32 band_json_len = strlen(band_json); + + char *buf; + INT32 *dest_int; + char *dest; + + buf = calloc(1, key_len + band_json_len + 4 + 1 * 8); + if (!buf) { + return NULL; + } + + dest_int = (void *)buf; + + dest_int[0] = 1; + dest_int[1] = key_len; + dest_int += 2; + dest = (void *)dest_int; + memcpy(dest, key, key_len); + dest += key_len; + dest_int = (void *)dest; + dest_int[0] = band_json_len; + dest_int += 1; + memcpy(dest_int, band_json, band_json_len); + + return buf; +} int export_named_type(struct ArrowSchema *schema, char *format, char *name) { @@ -95,6 +166,8 @@ export_named_type(struct ArrowSchema *schema, char *format, char *name) { int export_imaging_schema(Imaging im, struct ArrowSchema *schema) { int retval = 0; + char *metadata; + char *band_json; if (strcmp(im->arrow_band_format, "") == 0) { return IMAGING_ARROW_INCOMPATIBLE_MODE; @@ -117,13 +190,24 @@ export_imaging_schema(Imaging im, struct ArrowSchema *schema) { schema->n_children = 1; schema->children = calloc(1, sizeof(struct ArrowSchema *)); schema->children[0] = (struct ArrowSchema *)calloc(1, sizeof(struct ArrowSchema)); - retval = export_named_type(schema->children[0], im->arrow_band_format, "pixel"); + retval = export_named_type(schema->children[0], im->arrow_band_format, im->mode); if (retval != 0) { free(schema->children[0]); free(schema->children); schema->release(schema); return retval; } + + // band related metadata + band_json = image_band_json(im); + if (band_json) { + // adding the metadata to the child array. + // Accessible in pyarrow via pa.array(img).type.field(0).metadata + // adding it to the top level is not accessible. + schema->children[0]->metadata = assemble_metadata(band_json); + free(band_json); + } + return 0; } From 64556405e29d076d95d284d3d146cecff7476b7d Mon Sep 17 00:00:00 2001 From: wiredfool Date: Sat, 19 Jul 2025 17:34:39 +0200 Subject: [PATCH 2/8] WIP - Not working in pyarrow --- src/libImaging/Arrow.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/libImaging/Arrow.c b/src/libImaging/Arrow.c index 2ecec9b29..ff98dfb51 100644 --- a/src/libImaging/Arrow.c +++ b/src/libImaging/Arrow.c @@ -84,6 +84,33 @@ image_band_json(Imaging im) { return json; } +char * +single_band_json(Imaging im) { + char *format = "{\"bands\": [\"%s\"]}"; + char *json; + // Bands can be 1 band * (maybe but probably not) 2 characters each + int len = strlen(format) + 2 + 1; + int err; + + json = calloc(1, len); + + if (!json) { + return NULL; + } + + err = PyOS_snprintf( + json, + len, + format, + im->band_names[0] + ); + if (err < 0) { + return NULL; + } + return json; +} + + char * assemble_metadata(const char *band_json) { /* format is @@ -179,7 +206,17 @@ export_imaging_schema(Imaging im, struct ArrowSchema *schema) { } if (im->bands == 1) { - return export_named_type(schema, im->arrow_band_format, im->band_names[0]); + retval = export_named_type(schema, im->arrow_band_format, im->band_names[0]); + if (retval != 0) { + return retval; + } + // band related metadata + band_json = single_band_json(im); + if (band_json) { + schema->metadata = assemble_metadata(band_json); + free(band_json); + } + return retval; } retval = export_named_type(schema, "+w:4", ""); From 1159e65b4f60013f93c5b043743c407dbcf74777 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Sun, 20 Jul 2025 12:58:54 +0200 Subject: [PATCH 3/8] Added integration tests for Arro3, comparable to PyArrow tests --- Tests/test_arro3.py | 276 ++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 + 2 files changed, 278 insertions(+) create mode 100644 Tests/test_arro3.py diff --git a/Tests/test_arro3.py b/Tests/test_arro3.py new file mode 100644 index 000000000..ddc7ecd0a --- /dev/null +++ b/Tests/test_arro3.py @@ -0,0 +1,276 @@ +from __future__ import annotations + +import json +from typing import Any, NamedTuple +from itertools import repeat, chain + +import pytest + +from PIL import Image + +from .helper import ( + assert_deep_equal, + assert_image_equal, + hopper, + is_big_endian, +) + +TYPE_CHECKING = False +if TYPE_CHECKING: + from arro3.core import Array, DataType, Field, fixed_size_list_array + from arro3 import compute +else: + arro3 = pytest.importorskip("arro3", reason="Arro3 not installed") + from arro3.core import Array, DataType, Field, fixed_size_list_array + from arro3 import compute + +TEST_IMAGE_SIZE = (10, 10) + + +def _test_img_equals_pyarray( + img: Image.Image, arr: Any, mask: list[int] | None, elts_per_pixel: int = 1 +) -> None: + assert img.height * img.width * elts_per_pixel == len(arr) + px = img.load() + assert px is not None + if elts_per_pixel > 1 and mask is None: + # have to do element-wise comparison when we're comparing + # flattened r,g,b,a to a pixel. + mask = list(range(elts_per_pixel)) + for x in range(0, img.size[0], int(img.size[0] / 10)): + for y in range(0, img.size[1], int(img.size[1] / 10)): + if mask: + pixel = px[x, y] + assert isinstance(pixel, tuple) + for ix, elt in enumerate(mask): + if elts_per_pixel == 1: + assert pixel[ix] == arr[y * img.width + x].as_py()[elt] + else: + assert ( + pixel[ix] + == arr[(y * img.width + x) * elts_per_pixel + elt].as_py() + ) + else: + assert_deep_equal(px[x, y], arr[y * img.width + x].as_py()) + + +def _test_img_equals_int32_pyarray( + img: Image.Image, arr: Any, mask: list[int] | None, elts_per_pixel: int = 1 +) -> None: + assert img.height * img.width * elts_per_pixel == len(arr) + px = img.load() + assert px is not None + if mask is None: + # have to do element-wise comparison when we're comparing + # flattened rgba in an uint32 to a pixel. + mask = list(range(elts_per_pixel)) + for x in range(0, img.size[0], int(img.size[0] / 10)): + for y in range(0, img.size[1], int(img.size[1] / 10)): + pixel = px[x, y] + assert isinstance(pixel, tuple) + arr_pixel_int = arr[y * img.width + x].as_py() + arr_pixel_tuple = ( + arr_pixel_int % 256, + (arr_pixel_int // 256) % 256, + (arr_pixel_int // 256**2) % 256, + (arr_pixel_int // 256**3), + ) + if is_big_endian(): + arr_pixel_tuple = arr_pixel_tuple[::-1] + + for ix, elt in enumerate(mask): + assert pixel[ix] == arr_pixel_tuple[elt] + +fl_uint8_4_type = DataType.list(Field("_", DataType.uint8()).with_nullable(False), 4) + +@pytest.mark.parametrize( + "mode, dtype, mask", + ( + ("L", DataType.uint8(), None), + ("I", DataType.int32(), None), + ("F", DataType.float32(), None), + ("LA", fl_uint8_4_type, [0, 3]), + ("RGB", fl_uint8_4_type, [0, 1, 2]), + ("RGBA", fl_uint8_4_type, None), + ("RGBX", fl_uint8_4_type, None), + ("CMYK", fl_uint8_4_type, None), + ("YCbCr", fl_uint8_4_type, [0, 1, 2]), + ("HSV", fl_uint8_4_type, [0, 1, 2]), + ), +) +def test_to_array(mode: str, dtype: DataType, mask: list[int] | None) -> None: + img = hopper(mode) + + # Resize to non-square + img = img.crop((3, 0, 124, 127)) + assert img.size == (121, 127) + + arr = Array(img) # type: ignore[call-overload] + _test_img_equals_pyarray(img, arr, mask) + assert arr.type == dtype + + reloaded = Image.fromarrow(arr, mode, img.size) + + assert reloaded + + assert_image_equal(img, reloaded) + + +def test_lifetime() -> None: + # valgrind shouldn't error out here. + # arrays should be accessible after the image is deleted. + + img = hopper("L") + + arr_1 = Array(img) # type: ignore[call-overload] + arr_2 = Array(img) # type: ignore[call-overload] + + del img + + assert compute.sum(arr_1).as_py() > 0 + del arr_1 + + assert compute.sum(arr_2).as_py() > 0 + del arr_2 + + +def test_lifetime2() -> None: + # valgrind shouldn't error out here. + # img should remain after the arrays are collected. + + img = hopper("L") + + arr_1 = Array(img) # type: ignore[call-overload] + arr_2 = Array(img) # type: ignore[call-overload] + + assert compute.sum(arr_1).as_py() > 0 + del arr_1 + + assert compute.sum(arr_2).as_py() > 0 + del arr_2 + + img2 = img.copy() + px = img2.load() + assert px # make mypy happy + assert isinstance(px[0, 0], int) + + +class DataShape(NamedTuple): + dtype: DataType + # Strictly speaking, elt should be a pixel or pixel component, so + # list[uint8][4], float, int, uint32, uint8, etc. But more + # correctly, it should be exactly the dtype from the line above. + elt: Any + elts_per_pixel: int + + +UINT_ARR = DataShape( + dtype=fl_uint8_4_type, + elt=[1, 2, 3, 4], # array of 4 uint8 per pixel + elts_per_pixel=1, # only one array per pixel +) + +UINT = DataShape( + dtype=DataType.uint8(), + elt=3, # one uint8, + elts_per_pixel=4, # but repeated 4x per pixel +) + +UINT32 = DataShape( + dtype=DataType.uint32(), + elt=0xABCDEF45, # one packed int, doesn't fit in a int32 > 0x80000000 + elts_per_pixel=1, # one per pixel +) + +INT32 = DataShape( + dtype=DataType.uint32(), + elt=0x12CDEF45, # one packed int + elts_per_pixel=1, # one per pixel +) + + +@pytest.mark.parametrize( + "mode, data_tp, mask", + ( + ("L", DataShape(DataType.uint8(), 3, 1), None), + ("I", DataShape(DataType.int32(), 1 << 24, 1), None), + ("F", DataShape(DataType.float32(), 3.14159, 1), None), + ("LA", UINT_ARR, [0, 3]), + ("LA", UINT, [0, 3]), + ("RGB", UINT_ARR, [0, 1, 2]), + ("RGBA", UINT_ARR, None), + ("CMYK", UINT_ARR, None), + ("YCbCr", UINT_ARR, [0, 1, 2]), + ("HSV", UINT_ARR, [0, 1, 2]), + ("RGB", UINT, [0, 1, 2]), + ("RGBA", UINT, None), + ("CMYK", UINT, None), + ("YCbCr", UINT, [0, 1, 2]), + ("HSV", UINT, [0, 1, 2]), + ), +) +def test_fromarray(mode: str, data_tp: DataShape, mask: list[int] | None) -> None: + (dtype, elt, elts_per_pixel) = data_tp + + ct_pixels = TEST_IMAGE_SIZE[0] * TEST_IMAGE_SIZE[1] + if dtype == fl_uint8_4_type: + tmp_arr = Array(elt * (ct_pixels * elts_per_pixel), type=DataType.uint8()) + arr = fixed_size_list_array(tmp_arr, 4) + else: + arr = Array([elt] * (ct_pixels * elts_per_pixel), type=dtype) + img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) + + _test_img_equals_pyarray(img, arr, mask, elts_per_pixel) + + +@pytest.mark.parametrize( + "mode, data_tp, mask", + ( + ("LA", UINT32, [0, 3]), + ("RGB", UINT32, [0, 1, 2]), + ("RGBA", UINT32, None), + ("CMYK", UINT32, None), + ("YCbCr", UINT32, [0, 1, 2]), + ("HSV", UINT32, [0, 1, 2]), + ("LA", INT32, [0, 3]), + ("RGB", INT32, [0, 1, 2]), + ("RGBA", INT32, None), + ("CMYK", INT32, None), + ("YCbCr", INT32, [0, 1, 2]), + ("HSV", INT32, [0, 1, 2]), + ), +) +def test_from_int32array(mode: str, data_tp: DataShape, mask: list[int] | None) -> None: + (dtype, elt, elts_per_pixel) = data_tp + + ct_pixels = TEST_IMAGE_SIZE[0] * TEST_IMAGE_SIZE[1] + arr = Array([elt] * (ct_pixels * elts_per_pixel), type=dtype) + img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) + + _test_img_equals_int32_pyarray(img, arr, mask, elts_per_pixel) + + +@pytest.mark.parametrize( + "mode, metadata", + ( + ("LA", ["L", "X", "X", "A"]), + ("RGB", ["R", "G", "B", "X"]), + ("RGBX", ["R", "G", "B", "X"]), + ("RGBA", ["R", "G", "B", "A"]), + ("CMYK", ["C", "M", "Y", "K"]), + ("YCbCr", ["Y", "Cb", "Cr", "X"]), + ("HSV", ["H", "S", "V", "X"]), + ), +) +def test_image_metadata(mode: str, metadata: list[str]) -> None: + img = hopper(mode) + + arr = Array(img) # type: ignore[call-overload] + + assert arr.type.value_field.metadata + assert arr.type.value_field.metadata[b"image"] + + parsed_metadata = json.loads(arr.type.value_field.metadata[b"image"].decode("utf8")) + + assert "bands" in parsed_metadata + assert parsed_metadata["bands"] == metadata diff --git a/pyproject.toml b/pyproject.toml index 4e8623118..b1765e82c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,8 @@ optional-dependencies.mic = [ ] optional-dependencies.test-arrow = [ "pyarrow", + "arro3-core", + "arro3-compute", ] optional-dependencies.tests = [ From 1a02d4ed5a1672218249ed129f43d4109b32e183 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Sun, 20 Jul 2025 13:01:39 +0200 Subject: [PATCH 4/8] lint fixes --- Tests/test_arro3.py | 7 ++++--- pyproject.toml | 4 ++-- src/libImaging/Arrow.c | 8 +------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/Tests/test_arro3.py b/Tests/test_arro3.py index ddc7ecd0a..a7c755fc2 100644 --- a/Tests/test_arro3.py +++ b/Tests/test_arro3.py @@ -2,7 +2,6 @@ from __future__ import annotations import json from typing import Any, NamedTuple -from itertools import repeat, chain import pytest @@ -17,12 +16,12 @@ from .helper import ( TYPE_CHECKING = False if TYPE_CHECKING: - from arro3.core import Array, DataType, Field, fixed_size_list_array from arro3 import compute + from arro3.core import Array, DataType, Field, fixed_size_list_array else: arro3 = pytest.importorskip("arro3", reason="Arro3 not installed") - from arro3.core import Array, DataType, Field, fixed_size_list_array from arro3 import compute + from arro3.core import Array, DataType, Field, fixed_size_list_array TEST_IMAGE_SIZE = (10, 10) @@ -81,8 +80,10 @@ def _test_img_equals_int32_pyarray( for ix, elt in enumerate(mask): assert pixel[ix] == arr_pixel_tuple[elt] + fl_uint8_4_type = DataType.list(Field("_", DataType.uint8()).with_nullable(False), 4) + @pytest.mark.parametrize( "mode, dtype, mask", ( diff --git a/pyproject.toml b/pyproject.toml index b1765e82c..899e833e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,9 @@ optional-dependencies.mic = [ "olefile", ] optional-dependencies.test-arrow = [ - "pyarrow", - "arro3-core", "arro3-compute", + "arro3-core", + "pyarrow", ] optional-dependencies.tests = [ diff --git a/src/libImaging/Arrow.c b/src/libImaging/Arrow.c index ff98dfb51..4519243ae 100644 --- a/src/libImaging/Arrow.c +++ b/src/libImaging/Arrow.c @@ -98,19 +98,13 @@ single_band_json(Imaging im) { return NULL; } - err = PyOS_snprintf( - json, - len, - format, - im->band_names[0] - ); + err = PyOS_snprintf(json, len, format, im->band_names[0]); if (err < 0) { return NULL; } return json; } - char * assemble_metadata(const char *band_json) { /* format is From 28c7645d8bab0a746f60e37cbfde1c79b8f69ca9 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 21 Jul 2025 11:19:45 +0200 Subject: [PATCH 5/8] Added tests for integration with nanoarrow --- Tests/test_nanoarrow.py | 281 ++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 282 insertions(+) create mode 100644 Tests/test_nanoarrow.py diff --git a/Tests/test_nanoarrow.py b/Tests/test_nanoarrow.py new file mode 100644 index 000000000..e0ae11baa --- /dev/null +++ b/Tests/test_nanoarrow.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +import json +from typing import Any, NamedTuple + +import pytest + +from PIL import Image + +from .helper import ( + assert_deep_equal, + assert_image_equal, + hopper, + is_big_endian, +) + +TYPE_CHECKING = False +if TYPE_CHECKING: + import nanoarrow +else: + nanoarrow = pytest.importorskip("nanoarrow", reason="Nanoarrow not installed") + +TEST_IMAGE_SIZE = (10, 10) + + +def _test_img_equals_pyarray( + img: Image.Image, arr: Any, mask: list[int] | None, elts_per_pixel: int = 1 +) -> None: + assert img.height * img.width * elts_per_pixel == len(arr) + px = img.load() + assert px is not None + if elts_per_pixel > 1 and mask is None: + # have to do element-wise comparison when we're comparing + # flattened r,g,b,a to a pixel. + mask = list(range(elts_per_pixel)) + for x in range(0, img.size[0], int(img.size[0] / 10)): + for y in range(0, img.size[1], int(img.size[1] / 10)): + if mask: + pixel = px[x, y] + assert isinstance(pixel, tuple) + for ix, elt in enumerate(mask): + if elts_per_pixel == 1: + assert pixel[ix] == arr[y * img.width + x].as_py()[elt] + else: + assert ( + pixel[ix] + == arr[(y * img.width + x) * elts_per_pixel + elt].as_py() + ) + else: + assert_deep_equal(px[x, y], arr[y * img.width + x].as_py()) + + +def _test_img_equals_int32_pyarray( + img: Image.Image, arr: Any, mask: list[int] | None, elts_per_pixel: int = 1 +) -> None: + assert img.height * img.width * elts_per_pixel == len(arr) + px = img.load() + assert px is not None + if mask is None: + # have to do element-wise comparison when we're comparing + # flattened rgba in an uint32 to a pixel. + mask = list(range(elts_per_pixel)) + for x in range(0, img.size[0], int(img.size[0] / 10)): + for y in range(0, img.size[1], int(img.size[1] / 10)): + pixel = px[x, y] + assert isinstance(pixel, tuple) + arr_pixel_int = arr[y * img.width + x].as_py() + arr_pixel_tuple = ( + arr_pixel_int % 256, + (arr_pixel_int // 256) % 256, + (arr_pixel_int // 256**2) % 256, + (arr_pixel_int // 256**3), + ) + if is_big_endian(): + arr_pixel_tuple = arr_pixel_tuple[::-1] + + for ix, elt in enumerate(mask): + assert pixel[ix] == arr_pixel_tuple[elt] + + +fl_uint8_4_type = nanoarrow.fixed_size_list(value_type=nanoarrow.uint8(nullable=False), + list_size=4, + nullable=False) + + +@pytest.mark.parametrize( + "mode, dtype, mask", + ( + ("L", nanoarrow.uint8(nullable=False), None), + ("I", nanoarrow.int32(nullable=False), None), + ("F", nanoarrow.float32(nullable=False), None), + ("LA", fl_uint8_4_type, [0, 3]), + ("RGB", fl_uint8_4_type, [0, 1, 2]), + ("RGBA", fl_uint8_4_type, None), + ("RGBX", fl_uint8_4_type, None), + ("CMYK", fl_uint8_4_type, None), + ("YCbCr", fl_uint8_4_type, [0, 1, 2]), + ("HSV", fl_uint8_4_type, [0, 1, 2]), + ), +) +def test_to_array(mode: str, dtype: nanoarrow, mask: list[int] | None) -> None: + img = hopper(mode) + + # Resize to non-square + img = img.crop((3, 0, 124, 127)) + assert img.size == (121, 127) + + arr = nanoarrow.Array(img) # type: ignore[call-overload] + _test_img_equals_pyarray(img, arr, mask) + assert arr.schema.type == dtype.type + assert arr.schema.nullable == dtype.nullable + + reloaded = Image.fromarrow(arr, mode, img.size) + + assert reloaded + + assert_image_equal(img, reloaded) + + +def test_lifetime() -> None: + # valgrind shouldn't error out here. + # arrays should be accessible after the image is deleted. + + img = hopper("L") + + arr_1 = nanoarrow.Array(img) # type: ignore[call-overload] + arr_2 = nanoarrow.Array(img) # type: ignore[call-overload] + + del img + + assert sum(arr_1.iter_py()) > 0 + del arr_1 + + assert sum(arr_2.iter_py()) > 0 + del arr_2 + + +def test_lifetime2() -> None: + # valgrind shouldn't error out here. + # img should remain after the arrays are collected. + + img = hopper("L") + + arr_1 = nanoarrow.Array(img) # type: ignore[call-overload] + arr_2 = nanoarrow.Array(img) # type: ignore[call-overload] + + assert sum(arr_1.iter_py()) > 0 + del arr_1 + + assert sum(arr_2.iter_py()) > 0 + del arr_2 + + img2 = img.copy() + px = img2.load() + assert px # make mypy happy + assert isinstance(px[0, 0], int) + + +class DataShape(NamedTuple): + dtype: nanoarrow + # Strictly speaking, elt should be a pixel or pixel component, so + # list[uint8][4], float, int, uint32, uint8, etc. But more + # correctly, it should be exactly the dtype from the line above. + elt: Any + elts_per_pixel: int + + +UINT_ARR = DataShape( + dtype=fl_uint8_4_type, + elt=[1, 2, 3, 4], # array of 4 uint8 per pixel + elts_per_pixel=1, # only one array per pixel +) + +UINT = DataShape( + dtype=nanoarrow.uint8(), + elt=3, # one uint8, + elts_per_pixel=4, # but repeated 4x per pixel +) + +UINT32 = DataShape( + dtype=nanoarrow.uint32(), + elt=0xABCDEF45, # one packed int, doesn't fit in a int32 > 0x80000000 + elts_per_pixel=1, # one per pixel +) + +INT32 = DataShape( + dtype=nanoarrow.uint32(), + elt=0x12CDEF45, # one packed int + elts_per_pixel=1, # one per pixel +) + + +@pytest.mark.xfail(reason="Support for nested array creation is not available in nanoarrow/python") +@pytest.mark.parametrize( + "mode, data_tp, mask", + ( + ("L", DataShape(nanoarrow.uint8(), 3, 1), None), + ("I", DataShape(nanoarrow.int32(), 1 << 24, 1), None), + ("F", DataShape(nanoarrow.float32(), 3.14159, 1), None), + ("LA", UINT_ARR, [0, 3]), + ("LA", UINT, [0, 3]), + ("RGB", UINT_ARR, [0, 1, 2]), + ("RGBA", UINT_ARR, None), + ("CMYK", UINT_ARR, None), + ("YCbCr", UINT_ARR, [0, 1, 2]), + ("HSV", UINT_ARR, [0, 1, 2]), + ("RGB", UINT, [0, 1, 2]), + ("RGBA", UINT, None), + ("CMYK", UINT, None), + ("YCbCr", UINT, [0, 1, 2]), + ("HSV", UINT, [0, 1, 2]), + ), +) +def test_fromarray(mode: str, data_tp: DataShape, mask: list[int] | None) -> None: + (dtype, elt, elts_per_pixel) = data_tp + + ct_pixels = TEST_IMAGE_SIZE[0] * TEST_IMAGE_SIZE[1] + if dtype == fl_uint8_4_type: + # Apparently there's no good way to create this array from python using nanoarrow + # https://github.com/apache/arrow-nanoarrow/issues/620 + # the following lines will fail. + tmp_arr = nanoarrow.c_array(elt * (ct_pixels * elts_per_pixel), schema=nanoarrow.uint8()) + arr = nanoarrow.Array(tmp_arr, schema=dtype) + else: + arr = nanoarrow.Array(nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype)) + img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) + + _test_img_equals_pyarray(img, arr, mask, elts_per_pixel) + + +@pytest.mark.parametrize( + "mode, data_tp, mask", + ( + ("LA", UINT32, [0, 3]), + ("RGB", UINT32, [0, 1, 2]), + ("RGBA", UINT32, None), + ("CMYK", UINT32, None), + ("YCbCr", UINT32, [0, 1, 2]), + ("HSV", UINT32, [0, 1, 2]), + ("LA", INT32, [0, 3]), + ("RGB", INT32, [0, 1, 2]), + ("RGBA", INT32, None), + ("CMYK", INT32, None), + ("YCbCr", INT32, [0, 1, 2]), + ("HSV", INT32, [0, 1, 2]), + ), +) +def test_from_int32array(mode: str, data_tp: DataShape, mask: list[int] | None) -> None: + (dtype, elt, elts_per_pixel) = data_tp + + ct_pixels = TEST_IMAGE_SIZE[0] * TEST_IMAGE_SIZE[1] + arr = nanoarrow.Array(nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype)) + img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) + + _test_img_equals_int32_pyarray(img, arr, mask, elts_per_pixel) + + +@pytest.mark.parametrize( + "mode, metadata", + ( + ("LA", ["L", "X", "X", "A"]), + ("RGB", ["R", "G", "B", "X"]), + ("RGBX", ["R", "G", "B", "X"]), + ("RGBA", ["R", "G", "B", "A"]), + ("CMYK", ["C", "M", "Y", "K"]), + ("YCbCr", ["Y", "Cb", "Cr", "X"]), + ("HSV", ["H", "S", "V", "X"]), + ), +) +def test_image_nested_metadata(mode: str, metadata: list[str]) -> None: + img = hopper(mode) + + arr = nanoarrow.Array(img) # type: ignore[call-overload] + + assert arr.schema.value_type.metadata + assert arr.schema.value_type.metadata[b"image"] + + parsed_metadata = json.loads(arr.schema.value_type.metadata[b"image"].decode("utf8")) + + assert "bands" in parsed_metadata + assert parsed_metadata["bands"] == metadata diff --git a/pyproject.toml b/pyproject.toml index 899e833e3..5a4f752b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ optional-dependencies.test-arrow = [ "arro3-compute", "arro3-core", "pyarrow", + "nanoarrow", ] optional-dependencies.tests = [ From 7d2abbdcf91f01fd2bc83dace639f5eb5f7a562c Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 21 Jul 2025 11:22:45 +0200 Subject: [PATCH 6/8] lint. --- Tests/test_nanoarrow.py | 26 ++++++++++++++++++-------- pyproject.toml | 2 +- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Tests/test_nanoarrow.py b/Tests/test_nanoarrow.py index e0ae11baa..3dc540043 100644 --- a/Tests/test_nanoarrow.py +++ b/Tests/test_nanoarrow.py @@ -78,9 +78,9 @@ def _test_img_equals_int32_pyarray( assert pixel[ix] == arr_pixel_tuple[elt] -fl_uint8_4_type = nanoarrow.fixed_size_list(value_type=nanoarrow.uint8(nullable=False), - list_size=4, - nullable=False) +fl_uint8_4_type = nanoarrow.fixed_size_list( + value_type=nanoarrow.uint8(nullable=False), list_size=4, nullable=False +) @pytest.mark.parametrize( @@ -190,7 +190,9 @@ INT32 = DataShape( ) -@pytest.mark.xfail(reason="Support for nested array creation is not available in nanoarrow/python") +@pytest.mark.xfail( + reason="Support for nested array creation is not available in nanoarrow/python" +) @pytest.mark.parametrize( "mode, data_tp, mask", ( @@ -219,10 +221,14 @@ def test_fromarray(mode: str, data_tp: DataShape, mask: list[int] | None) -> Non # Apparently there's no good way to create this array from python using nanoarrow # https://github.com/apache/arrow-nanoarrow/issues/620 # the following lines will fail. - tmp_arr = nanoarrow.c_array(elt * (ct_pixels * elts_per_pixel), schema=nanoarrow.uint8()) + tmp_arr = nanoarrow.c_array( + elt * (ct_pixels * elts_per_pixel), schema=nanoarrow.uint8() + ) arr = nanoarrow.Array(tmp_arr, schema=dtype) else: - arr = nanoarrow.Array(nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype)) + arr = nanoarrow.Array( + nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype) + ) img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) _test_img_equals_pyarray(img, arr, mask, elts_per_pixel) @@ -249,7 +255,9 @@ def test_from_int32array(mode: str, data_tp: DataShape, mask: list[int] | None) (dtype, elt, elts_per_pixel) = data_tp ct_pixels = TEST_IMAGE_SIZE[0] * TEST_IMAGE_SIZE[1] - arr = nanoarrow.Array(nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype)) + arr = nanoarrow.Array( + nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype) + ) img = Image.fromarrow(arr, mode, TEST_IMAGE_SIZE) _test_img_equals_int32_pyarray(img, arr, mask, elts_per_pixel) @@ -275,7 +283,9 @@ def test_image_nested_metadata(mode: str, metadata: list[str]) -> None: assert arr.schema.value_type.metadata assert arr.schema.value_type.metadata[b"image"] - parsed_metadata = json.loads(arr.schema.value_type.metadata[b"image"].decode("utf8")) + parsed_metadata = json.loads( + arr.schema.value_type.metadata[b"image"].decode("utf8") + ) assert "bands" in parsed_metadata assert parsed_metadata["bands"] == metadata diff --git a/pyproject.toml b/pyproject.toml index 5a4f752b3..c450e4274 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,8 +59,8 @@ optional-dependencies.mic = [ optional-dependencies.test-arrow = [ "arro3-compute", "arro3-core", - "pyarrow", "nanoarrow", + "pyarrow", ] optional-dependencies.tests = [ From c07fe6e94313bf4172c670a8d6c4c3e7c78ac469 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 21 Jul 2025 11:33:14 +0200 Subject: [PATCH 7/8] Added flat image metadata tests This metadata is available in nanoarrow, but not pyarrow or arro3 --- Tests/test_nanoarrow.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Tests/test_nanoarrow.py b/Tests/test_nanoarrow.py index 3dc540043..f0d609545 100644 --- a/Tests/test_nanoarrow.py +++ b/Tests/test_nanoarrow.py @@ -289,3 +289,26 @@ def test_image_nested_metadata(mode: str, metadata: list[str]) -> None: assert "bands" in parsed_metadata assert parsed_metadata["bands"] == metadata + +@pytest.mark.parametrize( + "mode, metadata", + ( + ("L", ["L"]), + ("I", ["I"]), + ("F", ["F"]), + ), +) +def test_image_flat_metadata(mode: str, metadata: list[str]) -> None: + img = hopper(mode) + + arr = nanoarrow.Array(img) # type: ignore[call-overload] + + assert arr.schema.metadata + assert arr.schema.metadata[b"image"] + + parsed_metadata = json.loads( + arr.schema.metadata[b"image"].decode("utf8") + ) + + assert "bands" in parsed_metadata + assert parsed_metadata["bands"] == metadata From 9e415c787639d50fcfbd9a519174a60e3eb6c1e9 Mon Sep 17 00:00:00 2001 From: wiredfool Date: Mon, 21 Jul 2025 17:24:52 +0200 Subject: [PATCH 8/8] A way to make nested arrays in nano arrow but detouring through a buffer --- Tests/test_nanoarrow.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Tests/test_nanoarrow.py b/Tests/test_nanoarrow.py index f0d609545..b08333ae9 100644 --- a/Tests/test_nanoarrow.py +++ b/Tests/test_nanoarrow.py @@ -190,9 +190,6 @@ INT32 = DataShape( ) -@pytest.mark.xfail( - reason="Support for nested array creation is not available in nanoarrow/python" -) @pytest.mark.parametrize( "mode, data_tp, mask", ( @@ -218,13 +215,13 @@ def test_fromarray(mode: str, data_tp: DataShape, mask: list[int] | None) -> Non ct_pixels = TEST_IMAGE_SIZE[0] * TEST_IMAGE_SIZE[1] if dtype == fl_uint8_4_type: - # Apparently there's no good way to create this array from python using nanoarrow - # https://github.com/apache/arrow-nanoarrow/issues/620 - # the following lines will fail. - tmp_arr = nanoarrow.c_array( + tmp_arr = nanoarrow.Array( elt * (ct_pixels * elts_per_pixel), schema=nanoarrow.uint8() ) - arr = nanoarrow.Array(tmp_arr, schema=dtype) + c_array = nanoarrow.c_array_from_buffers( + dtype, ct_pixels, buffers=[], children=[tmp_arr] + ) + arr = nanoarrow.Array(c_array) else: arr = nanoarrow.Array( nanoarrow.c_array([elt] * (ct_pixels * elts_per_pixel), schema=dtype) @@ -290,6 +287,7 @@ def test_image_nested_metadata(mode: str, metadata: list[str]) -> None: assert "bands" in parsed_metadata assert parsed_metadata["bands"] == metadata + @pytest.mark.parametrize( "mode, metadata", ( @@ -306,9 +304,7 @@ def test_image_flat_metadata(mode: str, metadata: list[str]) -> None: assert arr.schema.metadata assert arr.schema.metadata[b"image"] - parsed_metadata = json.loads( - arr.schema.metadata[b"image"].decode("utf8") - ) + parsed_metadata = json.loads(arr.schema.metadata[b"image"].decode("utf8")) assert "bands" in parsed_metadata assert parsed_metadata["bands"] == metadata