From 1e70975dc187dfae096004e9ec23a045a125d8d2 Mon Sep 17 00:00:00 2001
From: Alexander
Date: Wed, 22 Jan 2020 13:36:22 +0300
Subject: [PATCH] SIMD ColorLUT: 16-bit arithmetic; fix access violation

---
 src/libImaging/ColorLUT.c | 581 ++++++++++++++++++++++----------------
 1 file changed, 331 insertions(+), 250 deletions(-)

diff --git a/src/libImaging/ColorLUT.c b/src/libImaging/ColorLUT.c
index 57b7bb18c..33fd6e3c0 100644
--- a/src/libImaging/ColorLUT.c
+++ b/src/libImaging/ColorLUT.c
@@ -7,13 +7,118 @@
 #define PRECISION_BITS (16 - 8 - 2)
 #define PRECISION_ROUNDING (1 << (PRECISION_BITS - 1))
 
-/* 8 — scales are multiplied on byte.
+/* 16 — UINT16 capacity
    6 — max index in the table
-   (max size is 65, but index 64 is not reachable) */
-#define SCALE_BITS (32 - 8 - 6)
-#define SCALE_MASK ((1 << SCALE_BITS) - 1)
+       (max size is 65, but index 64 is not reachable)
+   7 — that many bits we can save if we divide the value by 255. */
+#define SCALE_BITS (16 - 6 + 7)
+#define SCALE_MASK ((1<<SCALE_BITS) - 1)
+
+/* SHIFT_BITS must be >= 9, <= 14. */
+#define SHIFT_BITS (16 - 7)
+
+
+__m128i static inline
+mm_lut_pixel(INT16* table, int size2D_shift, int size3D_shift,
+             int idx, __m128i shift, __m128i shuffle)
+{
+    __m128i leftleft, leftright, rightleft, rightright;
+    __m128i source, left, right;
+    __m128i shift1D = _mm_shuffle_epi32(shift, 0x00);
+    __m128i shift2D = _mm_shuffle_epi32(shift, 0x55);
+    __m128i shift3D = _mm_shuffle_epi32(shift, 0xaa);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + 0]), shuffle);
+    leftleft = _mm_srai_epi32(_mm_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + size2D_shift]), shuffle);
+    leftright = _mm_slli_epi32(_mm_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + size3D_shift]), shuffle);
+    rightleft = _mm_srai_epi32(_mm_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm_shuffle_epi8(
+        _mm_loadu_si128((__m128i *) &table[idx + size3D_shift + size2D_shift]), shuffle);
+    rightright = _mm_slli_epi32(_mm_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    left = _mm_srai_epi32(_mm_madd_epi16(
+        _mm_blend_epi16(leftleft, leftright, 0xaa), shift2D),
+        SHIFT_BITS);
+
+    right = _mm_slli_epi32(_mm_madd_epi16(
+        _mm_blend_epi16(rightleft, rightright, 0xaa), shift2D),
+        16 - SHIFT_BITS);
+
+    return _mm_srai_epi32(_mm_madd_epi16(
+        _mm_blend_epi16(left, right, 0xaa), shift3D),
+        SHIFT_BITS);
+}
+
+
+#if defined(__AVX2__)
+__m256i static inline
+mm256_lut_pixel(INT16* table, int size2D_shift, int size3D_shift,
+                int idx1, int idx2, __m256i shift, __m256i shuffle)
+{
+    __m256i leftleft, leftright, rightleft, rightright;
+    __m256i source, left, right;
+    __m256i shift1D = _mm256_shuffle_epi32(shift, 0x00);
+    __m256i shift2D = _mm256_shuffle_epi32(shift, 0x55);
+    __m256i shift3D = _mm256_shuffle_epi32(shift, 0xaa);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + 0])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + 0]), 1),
+        shuffle);
+    leftleft = _mm256_srai_epi32(_mm256_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + size2D_shift])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + size2D_shift]), 1),
+        shuffle);
+    leftright = _mm256_slli_epi32(_mm256_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + size3D_shift])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + size3D_shift]), 1),
+        shuffle);
+    rightleft = _mm256_srai_epi32(_mm256_madd_epi16(
+        source, shift1D), SHIFT_BITS);
+
+    source = _mm256_shuffle_epi8(
+        _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) &table[idx1 + size3D_shift + size2D_shift])),
+            _mm_loadu_si128((__m128i *) &table[idx2 + size3D_shift + size2D_shift]), 1),
+        shuffle);
+    rightright = _mm256_slli_epi32(_mm256_madd_epi16(
+        source, shift1D), 16 - SHIFT_BITS);
+
+    left = _mm256_srai_epi32(_mm256_madd_epi16(
+        _mm256_blend_epi16(leftleft, leftright, 0xaa), shift2D),
+        SHIFT_BITS);
+
+    right = _mm256_slli_epi32(_mm256_madd_epi16(
+        _mm256_blend_epi16(rightleft, rightright, 0xaa), shift2D),
+        16 - SHIFT_BITS);
+
+    return _mm256_srai_epi32(_mm256_madd_epi16(
+        _mm256_blend_epi16(left, right, 0xaa), shift3D),
+        SHIFT_BITS);
+}
+#endif
 
 /*
@@ -51,28 +156,46 @@ ImagingColorLUT3D_linear(
     With this compensation we never hit the upper cells
     but this also doesn't introduce any noticeable difference. */
     int y, size1D_2D = size1D * size2D;
+    int size2D_shift = size1D * table_channels;
+    int size3D_shift = size1D_2D * table_channels;
     ImagingSectionCookie cookie;
-    __m128i scale = _mm_set_epi32(0,
-        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
-        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
-        (size1D - 1) / 255.0 * (1<<SCALE_BITS));
-    __m128i scale_mask = _mm_set1_epi32(SCALE_MASK);
-    __m128i index_mul = _mm_set_epi32(0, size1D_2D*table_channels,
-        size1D*table_channels, table_channels);
+    __m128i scale = _mm_set_epi16(0,
+        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size1D - 1) / 255.0 * (1<<SCALE_BITS),
+        0,
+        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size1D - 1) / 255.0 * (1<<SCALE_BITS));
+    __m128i scale_mask = _mm_set1_epi16(SCALE_MASK >> 8);
+    __m128i index_mul = _mm_set_epi16(
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels);
     __m128i shuffle3 = _mm_set_epi8(-1,-1, -1,-1, 11,10, 5,4, 9,8, 3,2, 7,6, 1,0);
     __m128i shuffle4 = _mm_set_epi8(15,14, 7,6, 13,12, 5,4, 11,10, 3,2, 9,8, 1,0);
 #if defined(__AVX2__)
-    __m256i scale256 = _mm256_set_epi32(
+    __m256i scale256 = _mm256_set_epi16(
         0,
-        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
-        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
-        (size1D - 1) / 255.0 * (1<<SCALE_BITS),
-        0,
-        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
-        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
-        (size1D - 1) / 255.0 * (1<<SCALE_BITS));
-    __m256i scale_mask256 = _mm256_set1_epi32(SCALE_MASK);
-    __m256i index_mul256 = _mm256_set_epi32(
-        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
-        0, size1D_2D*table_channels, size1D*table_channels, table_channels);
+        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size1D - 1) / 255.0 * (1<<SCALE_BITS),
+        0,
+        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size1D - 1) / 255.0 * (1<<SCALE_BITS),
+        0,
+        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size1D - 1) / 255.0 * (1<<SCALE_BITS),
+        0,
+        (size3D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size2D - 1) / 255.0 * (1<<SCALE_BITS),
+        (size1D - 1) / 255.0 * (1<<SCALE_BITS));
+    __m256i scale_mask256 = _mm256_set1_epi16(SCALE_MASK >> 8);
+    __m256i index_mul256 = _mm256_set_epi16(
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels,
+        0, size1D_2D*table_channels, size1D*table_channels, table_channels);
     __m256i shuffle3_256 = _mm256_set_epi8(
@@ -102,268 +225,226 @@ ImagingColorLUT3D_linear(
     for (y = 0; y < imOut->ysize; y++) {
         UINT32* rowIn = (UINT32 *)imIn->image[y];
         UINT32* rowOut = (UINT32 *)imOut->image[y];
-        UINT8* rowIn8 = (UINT8 *)imIn->image[y];
-        UINT8* rowOut8 = (UINT8 *)imOut->image[y];
         int x = 0;
 
 #if defined(__AVX2__)
-    {
-        __m256i index = _mm256_mullo_epi32(scale256, mm256_cvtepu8_epi32(&rowIn[x]));
-        __m256i idxs = _mm256_hadd_epi32(_mm256_hadd_epi32(
-            _mm256_madd_epi16(index_mul256, _mm256_srli_epi32(index, SCALE_BITS)),
-            _mm256_setzero_si256()), _mm256_setzero_si256());
-        int idx1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(idxs));
-        int idx2 = _mm256_extract_epi32(idxs, 4);
+        // This makes sense if 12 or more pixels remain (two loops)
+        if (x < imOut->xsize - 11) {
+            __m128i source = _mm_loadu_si128((__m128i *) &rowIn[x]);
+            // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
+            // bi, gi and ri - 6-bit indices
+            // bs, gs and rs - 9-bit shifts
+            // 00 bi3.bs3 gi3.gs3 ri3.rs3 00 bi2.bs2 gi2.gs2 ri2.rs2
+            // 00 bi1.bs1 gi1.gs1 ri1.rs1 00 bi0.bs0 gi0.gs0 ri0.rs0
+            __m256i index = _mm256_mulhi_epu16(scale256,
+                _mm256_slli_epi16(_mm256_cvtepu8_epi16(source), 8));
+            // 0000 0000 idx3 idx2
+            // 0000 0000 idx1 idx0
+            __m256i index_src = _mm256_hadd_epi32(
+                _mm256_madd_epi16(index_mul256, _mm256_srli_epi16(index, SHIFT_BITS)),
+                _mm256_setzero_si256());
 
-        for (; x < imOut->xsize - 1; x += 2) {
-            __m256i next_index = _mm256_mullo_epi32(scale256, mm256_cvtepu8_epi32(&rowIn[x + 2]));
-            __m256i next_idxs = _mm256_hadd_epi32(_mm256_hadd_epi32(
-                _mm256_madd_epi16(index_mul256, _mm256_srli_epi32(next_index, SCALE_BITS)),
-                _mm256_setzero_si256()), _mm256_setzero_si256());
-            int next_idx1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(next_idxs));
-            int next_idx2 = _mm256_extract_epi32(next_idxs, 4);
+            for (; x < imOut->xsize - 7; x += 4) {
+                __m128i next_source = _mm_loadu_si128((__m128i *) &rowIn[x + 4]);
+                // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
+                // bi, gi and ri - 6-bit indices
+                // bs, gs and rs - 9-bit shifts
+                // 00 bi3.bs3 gi3.gs3 ri3.rs3 00 bi2.bs2 gi2.gs2 ri2.rs2
+                // 00 bi1.bs1 gi1.gs1 ri1.rs1 00 bi0.bs0 gi0.gs0 ri0.rs0
+                __m256i next_index = _mm256_mulhi_epu16(scale256,
+                    _mm256_slli_epi16(_mm256_cvtepu8_epi16(next_source), 8));
+                // 0000 0000 idx3 idx2
+                // 0000 0000 idx1 idx0
+                __m256i next_index_src = _mm256_hadd_epi32(
+                    _mm256_madd_epi16(index_mul256, _mm256_srli_epi16(next_index, SHIFT_BITS)),
+                    _mm256_setzero_si256());
 
-            __m256i shift = _mm256_srli_epi32(
-                _mm256_and_si256(scale_mask256, index), (SCALE_BITS - SHIFT_BITS));
+                int idx0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(index_src));
+                int idx1 = _mm256_extract_epi32(index_src, 1);
+                int idx2 = _mm256_extract_epi32(index_src, 4);
+                int idx3 = _mm256_extract_epi32(index_src, 5);
 
-            __m256i shift1D, shift2D, shift3D;
-            __m256i source, left, right, result;
-            __m256i leftleft, leftright, rightleft, rightright;
+                // 00 0.bs3 0.gs3 0.rs3 00 0.bs2 0.gs2 0.rs2
+                // 00 0.bs1 0.gs1 0.rs1 00 0.bs0 0.gs0 0.rs0
+                __m256i shift = _mm256_and_si256(index, scale_mask256);
+                // 11 1-bs3 1-gs3 1-rs3 11 1-bs2 1-gs2 1-rs2
+                // 11 1-bs1 1-gs1 1-rs1 11 1-bs0 1-gs0 1-rs0
+                __m256i inv_shift = _mm256_sub_epi16(_mm256_set1_epi16((1<<SHIFT_BITS)), shift);
+            }
+        }
-        }
-    }
 #endif
+
+        if (x < imOut->xsize - 5) {
+            __m128i source = _mm_loadl_epi64((__m128i *) &rowIn[x]);
+            // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
+            // bi, gi and ri - 6-bit indices
+            // bs, gs and rs - 9-bit shifts
+            // 00 bi1.bs1 gi1.gs1 ri1.rs1 00 bi0.bs0 gi0.gs0 ri0.rs0
+            __m128i index = _mm_mulhi_epu16(scale,
+                _mm_unpacklo_epi8(_mm_setzero_si128(), source));
+            // 0000 0000 idx1 idx0
+            __m128i index_src = _mm_hadd_epi32(
+                _mm_madd_epi16(index_mul, _mm_srli_epi16(index, SHIFT_BITS)),
+                _mm_setzero_si128());
 
-        __m128i index = _mm_mullo_epi32(scale, mm_cvtepu8_epi32(&rowIn[x]));
-        int idx = _mm_cvtsi128_si32(
-            _mm_hadd_epi32(_mm_hadd_epi32(
-                _mm_madd_epi16(index_mul, _mm_srli_epi32(index, SCALE_BITS)),
-                _mm_setzero_si128()), _mm_setzero_si128()));
+            for (; x < imOut->xsize - 3; x += 2) {
+                __m128i next_source = _mm_loadl_epi64((__m128i *) &rowIn[x + 2]);
+                __m128i next_index = _mm_mulhi_epu16(scale,
+                    _mm_unpacklo_epi8(_mm_setzero_si128(), next_source));
+                __m128i next_index_src = _mm_hadd_epi32(
+                    _mm_madd_epi16(index_mul, _mm_srli_epi16(next_index, SHIFT_BITS)),
+                    _mm_setzero_si128());
+
+                int idx0 = _mm_cvtsi128_si32(index_src);
+                int idx1 = _mm_cvtsi128_si32(_mm_srli_si128(index_src, 4));
+
+                // 00 0.bs1 0.gs1 0.rs1 00 0.bs0 0.gs0 0.rs0
+                __m128i shift = _mm_and_si128(index, scale_mask);
+                // 11 1-bs1 1-gs1 1-rs1 11 1-bs0 1-gs0 1-rs0
+                __m128i inv_shift = _mm_sub_epi16(_mm_set1_epi16((1<<SHIFT_BITS)), shift);
+            }
+        }
+
         for (; x < imOut->xsize; x++) {
-            __m128i next_index = _mm_mullo_epi32(scale, mm_cvtepu8_epi32(&rowIn[x + 1]));
-            int next_idx = _mm_cvtsi128_si32(
-                _mm_hadd_epi32(_mm_hadd_epi32(
-                    _mm_madd_epi16(index_mul, _mm_srli_epi32(next_index, SCALE_BITS)),
-                    _mm_setzero_si128()), _mm_setzero_si128()));
+            __m128i source = _mm_cvtsi32_si128(rowIn[x]);
+            // scale up to 16 bits, but scale * 255 * 256 up to 31 bits
+            // bi, gi and ri - 6-bit indices
+            // bs, gs and rs - 9-bit shifts
+            // 00 00 00 00 00 bi.bs gi.gs ri.rs
+            __m128i index = _mm_mulhi_epu16(scale,
+                _mm_unpacklo_epi8(_mm_setzero_si128(), source));
 
-            __m128i shift = _mm_srli_epi32(
-                _mm_and_si128(scale_mask, index), (SCALE_BITS - SHIFT_BITS));
+            int idx0 = _mm_cvtsi128_si32(
+                _mm_hadd_epi32(
+                    _mm_madd_epi16(index_mul, _mm_srli_epi16(index, SHIFT_BITS)),
+                    _mm_setzero_si128()));
 
-            __m128i shift1D, shift2D, shift3D;
-            __m128i source, left, right, result;
-            __m128i leftleft, leftright, rightleft, rightright;
+            // 00 00 00 00 00 0.bs 0.gs 0.rs
+            __m128i shift = _mm_and_si128(index, scale_mask);
+            // 11 11 11 11 11 1-bs 1-gs 1-rs
+            __m128i inv_shift = _mm_sub_epi16(_mm_set1_epi16((1<<SHIFT_BITS)), shift);
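For reference, the fixed-point mapping that the new scale vectors implement can be checked against a scalar model. The sketch below mirrors what _mm_mulhi_epu16 computes for a single channel once the input byte has been shifted into the high half of a 16-bit lane: the high 16 bits of the product form a 6.9 fixed-point table coordinate, which splits into a 6-bit cell index and a 9-bit fractional shift. All names here are illustrative; this is not code from the patch.

#include <stdint.h>
#include <stdio.h>

#define SCALE_BITS (16 - 6 + 7)  /* 17, as in the patch */
#define SHIFT_BITS (16 - 7)      /* 9, as in the patch */

int main(void)
{
    int size1D = 65;        /* one axis of the 3D LUT */
    /* (65 - 1) / 255 * 2^17 is about 32900, so it still fits in 16 bits */
    uint16_t scale = (uint16_t) ((size1D - 1) / 255.0 * (1 << SCALE_BITS));
    uint8_t value = 200;    /* one 8-bit input channel */

    /* scalar equivalent of mulhi_epu16: high 16 bits of scale * (value << 8);
       scale * 255 * 256 stays below 2^31, as the patch comments note */
    uint16_t coord = (uint16_t) (((uint32_t) scale * (uint32_t) (value << 8)) >> 16);

    int cell  = coord >> SHIFT_BITS;               /* 6-bit cell index (ri) */
    int shift = coord & ((1 << SHIFT_BITS) - 1);   /* 9-bit fraction (rs) */
    printf("cell=%d shift=%d/512\n", cell, shift); /* cell=50 for value=200 */
    return 0;
}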
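The arithmetic inside mm_lut_pixel is likewise easier to audit in scalar form. Each _mm_madd_epi16 against a (1-s, s) weight pair followed by _mm_srai_epi32(..., SHIFT_BITS) is one fixed-point linear interpolation, and the helper chains three of them across the 1D, 2D and 3D axes of the table. The sketch below shows that cascade for one channel under that reading; the function and parameter names are assumptions for illustration, not the patch's own code.

#include <stdint.h>

#define SHIFT_BITS (16 - 7)  /* 9 fractional bits, as in the patch */

/* left * (1-s) + right * s in 9-bit fixed point; the two weights sum to
   1 << SHIFT_BITS, so the result stays within the INT16 range. */
static int32_t lerp9(int16_t left, int16_t right, uint16_t s)
{
    uint16_t inv_s = (uint16_t) ((1 << SHIFT_BITS) - s);
    return ((int32_t) left * inv_s + (int32_t) right * s) >> SHIFT_BITS;
}

/* Trilinear fetch of one channel from the 2x2x2 cell starting at idx.
   channels stands in for table_channels; size2D_shift and size3D_shift
   are the row and plane strides passed to mm_lut_pixel. The result still
   carries PRECISION_BITS of fraction, which PRECISION_ROUNDING above
   exists to round away. */
static int32_t trilinear9(const int16_t *table, int idx, int channels,
                          int size2D_shift, int size3D_shift,
                          uint16_t s1D, uint16_t s2D, uint16_t s3D)
{
    int32_t leftleft   = lerp9(table[idx],
                               table[idx + channels], s1D);
    int32_t leftright  = lerp9(table[idx + size2D_shift],
                               table[idx + size2D_shift + channels], s1D);
    int32_t rightleft  = lerp9(table[idx + size3D_shift],
                               table[idx + size3D_shift + channels], s1D);
    int32_t rightright = lerp9(table[idx + size3D_shift + size2D_shift],
                               table[idx + size3D_shift + size2D_shift + channels], s1D);
    int32_t left  = lerp9((int16_t) leftleft,  (int16_t) leftright,  s2D);
    int32_t right = lerp9((int16_t) rightleft, (int16_t) rightright, s2D);
    return lerp9((int16_t) left, (int16_t) right, s3D);
}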