From 16f69404f10202cfe2be6bbe58982abee67f6f60 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 27 Mar 2018 03:20:19 +0300 Subject: [PATCH] SIMD ColorLUT. First try --- src/_imaging.c | 2 +- src/libImaging/ColorLUT.c | 153 ++++++++++++++++++++------------------ 2 files changed, 82 insertions(+), 73 deletions(-) diff --git a/src/_imaging.c b/src/_imaging.c index 2a42c0461..88af37f82 100644 --- a/src/_imaging.c +++ b/src/_imaging.c @@ -761,7 +761,7 @@ _prepare_lut_table(PyObject *table, Py_ssize_t table_size) { } /* malloc check ok, max is 2 * 4 * 65**3 = 2197000 */ - prepared = (INT16 *)malloc(sizeof(INT16) * table_size); + prepared = (INT16 *)malloc(sizeof(INT16) * table_size + 2); if (!prepared) { if (free_table_data) { free(table_data); diff --git a/src/libImaging/ColorLUT.c b/src/libImaging/ColorLUT.c index fd6e268b5..690d28b39 100644 --- a/src/libImaging/ColorLUT.c +++ b/src/libImaging/ColorLUT.c @@ -13,7 +13,8 @@ #define SCALE_BITS (32 - 8 - 6) #define SCALE_MASK ((1 << SCALE_BITS) - 1) -#define SHIFT_BITS (16 - 1) +#define SHIFT_BITS (16 - 2) +#define SHIFT_ROUNDING (1<<(SHIFT_BITS-1)) static inline UINT8 clip8(int in) { @@ -74,10 +75,17 @@ ImagingColorLUT3D_linear( +1 cells will be outside of the table. With this compensation we never hit the upper cells but this also doesn't introduce any noticeable difference. */ - UINT32 scale1D = (size1D - 1) / 255.0 * (1 << SCALE_BITS); - UINT32 scale2D = (size2D - 1) / 255.0 * (1 << SCALE_BITS); - UINT32 scale3D = (size3D - 1) / 255.0 * (1 << SCALE_BITS); int size1D_2D = size1D * size2D; + __m128i scale = _mm_set_epi32(0, + (size3D - 1) / 255.0 * (1<image[y]; char *rowOut = (char *)imOut->image[y]; for (x = 0; x < imOut->xsize; x++) { - UINT32 index1D = rowIn[x * 4 + 0] * scale1D; - UINT32 index2D = rowIn[x * 4 + 1] * scale2D; - UINT32 index3D = rowIn[x * 4 + 2] * scale3D; - INT16 shift1D = (SCALE_MASK & index1D) >> (SCALE_BITS - SHIFT_BITS); - INT16 shift2D = (SCALE_MASK & index2D) >> (SCALE_BITS - SHIFT_BITS); - INT16 shift3D = (SCALE_MASK & index3D) >> (SCALE_BITS - SHIFT_BITS); - int idx = table_channels * table_index3D( - index1D >> SCALE_BITS, - index2D >> SCALE_BITS, - index3D >> SCALE_BITS, - size1D, - size1D_2D); - INT16 result[4], left[4], right[4]; - INT16 leftleft[4], leftright[4], rightleft[4], rightright[4]; + __m128i index = _mm_mullo_epi32(scale, + _mm_cvtepu8_epi32(*(__m128i *) &rowIn[x*4])); + __m128i shift = _mm_srli_epi32( + _mm_and_si128(scale_mask, index), (SCALE_BITS - SHIFT_BITS)); + int idx = table_channels * _mm_extract_epi32( + _mm_hadd_epi32(_mm_hadd_epi32( + _mm_madd_epi16(index_mul, _mm_srli_epi32(index, SCALE_BITS)), + _mm_setzero_si128()), _mm_setzero_si128()), 0); + __m128i shift1D, shift2D, shift3D; + __m128i source, left, right, result; + __m128i leftleft, leftright, rightleft, rightright; + + shift = _mm_or_si128( + _mm_sub_epi32(_mm_set1_epi32(1<