From 01998fc71d946fc04149d3c0428d626696394e2f Mon Sep 17 00:00:00 2001 From: wiredfool Date: Tue, 18 Feb 2025 21:27:42 +0000 Subject: [PATCH] Added docs --- docs/reference/Image.rst | 3 ++ docs/reference/arrow_support.rst | 85 ++++++++++++++++++++++++++++++ docs/reference/internal_design.rst | 1 + src/PIL/Image.py | 36 +++++++++++++ 4 files changed, 125 insertions(+) create mode 100644 docs/reference/arrow_support.rst diff --git a/docs/reference/Image.rst b/docs/reference/Image.rst index bc3758218..6a682cabc 100644 --- a/docs/reference/Image.rst +++ b/docs/reference/Image.rst @@ -79,6 +79,7 @@ Constructing images .. autofunction:: new .. autofunction:: fromarray +.. autofunction:: fromarrow .. autofunction:: frombytes .. autofunction:: frombuffer @@ -370,6 +371,8 @@ Protocols .. autoclass:: SupportsArrayInterface :show-inheritance: +.. autoclass:: SupportsArrowInterface + :show-inheritance: .. autoclass:: SupportsGetData :show-inheritance: diff --git a/docs/reference/arrow_support.rst b/docs/reference/arrow_support.rst new file mode 100644 index 000000000..964446000 --- /dev/null +++ b/docs/reference/arrow_support.rst @@ -0,0 +1,85 @@ +============= +Arrow Support +============= + +Arrow is an in-memory data exchange format that is the spiritual +successor to the NumPy array interface. It provides for zero-copy +access to columnar data, which in our case is image data. + +The goal with Arrow is to provide native zero-copy interop with any +arrow provider or consumer in the Python ecosystem. + +.. warning:: Zero-copy does not mean zero allocation -- the internal +   memory layout of Pillow images contains an allocation for row +   pointers, so reading an arrow image has a non-zero memory cost, +   though one significantly smaller than a full copy. + + +Data Formats +============ + +Pillow currently supports exporting arrow images in all modes +**except** for ``BGR;15``, ``BGR;16`` and ``BGR;24``. 
This is due to +line length packing in these modes making for non-contiguous memory. + +For single band images, the exported array is width*height elements, +with each pixel corresponding to the appropriate arrow type. + +For multiband images, the exported array is width*height fixed-length +4-element arrays of uint8. This is memory compatible with the raw +image storage of 4 bytes per pixel. + +Mode ``1`` images are exported as 1 uint8 byte/pixel, as this is +consistent with the internal storage. + +Pillow will accept, but not produce, one other format. For any +multichannel image with 32-bit storage per pixel, Pillow will accept +an array of width*height int32 elements, which will then be +interpreted using the mode-specific interpretation of the bytes. + +The image mode must match the arrow band format when reading +single-channel images. + +Memory Allocator +================ + +Pillow's default memory allocator, the :ref:`block_allocator`, +allocates up to a 16MB block for images by default. Larger images +overflow into additional blocks. Arrow requires a single contiguous +memory allocation, so images allocated in multiple blocks cannot be +exported in the arrow format. + +To enable the single block allocator:: + + from PIL import Image + Image.core.set_use_block_allocator(1) + +Note that this is a global setting, not a per-image setting. + +Unsupported Features +==================== + +* Table/Dataframe protocol. We currently support a single array. +* Null markers, producing or consuming. Null values are inferred from + the mode, e.g. RGB images are stored in the first three bytes of + each 32-bit pixel, and the last byte is an implied null. +* Schema Negotiation. There is an optional schema for the requested + datatype in the arrow source interface. We currently ignore that + parameter. +* Array Metadata. 
+ +Internal Details +================ + +Python Arrow C interface: +https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + +The memory that is exported from the arrow interface is shared -- not +copied, so the lifetime of the memory allocation is no longer strictly +tied to the life of the Python object. + +The core imaging struct now has a refcount associated with it, and the +lifetime of the core image struct is now divorced from the Python +image object. Creating an arrow reference to the image increments the +refcount, and the imaging struct is only released when the refcount +reaches 0. diff --git a/docs/reference/internal_design.rst b/docs/reference/internal_design.rst index 99a18e9ea..041177953 100644 --- a/docs/reference/internal_design.rst +++ b/docs/reference/internal_design.rst @@ -9,3 +9,4 @@ Internal Reference block_allocator internal_modules c_extension_debugging + arrow_support diff --git a/src/PIL/Image.py b/src/PIL/Image.py index 328d2202d..ebfd49b00 100644 --- a/src/PIL/Image.py +++ b/src/PIL/Image.py @@ -3305,6 +3305,42 @@ def fromarray(obj: SupportsArrayInterface, mode: str | None = None) -> Image: def fromarrow(obj: SupportsArrowArrayInterface, mode, size) -> Image: + """Creates an image with zero-copy shared memory from an object exporting + the arrow_c_array interface protocol:: + + from PIL import Image + import pyarrow as pa + arr = pa.array([0]*(5*5*4), type=pa.uint8()) + im = Image.fromarrow(arr, 'RGBA', (5, 5)) + + If the data representation of the ``obj`` is not compatible with + Pillow internal storage, a ValueError is raised. + + Pillow images can also be converted to arrow objects:: + + from PIL import Image + import pyarrow as pa + im = Image.open('hopper.jpg') + arr = pa.array(im) + + As with array support, when converting Pillow images to arrays, + only pixel values are transferred. This means that P and PA mode + images will lose their palette. 
+ + :param obj: Object with an arrow_c_array interface. + :param mode: Image mode. + :param size: Image size. This must match the storage of the arrow object. + :returns: An image object. + + Note that according to the arrow spec, both the producer and the + consumer should consider the exported array to be immutable, as + unsynchronized updates will potentially cause inconsistent data. + + See: :ref:`arrow-support` for more detailed information. + + .. versionadded:: 11.2 + + """ if not hasattr(obj, "__arrow_c_array__"): msg = "arrow_c_array interface not found" raise ValueError(msg)