diff --git a/src/libImaging/ColorLUT.c b/src/libImaging/ColorLUT.c index 938653171..c9918db86 100644 --- a/src/libImaging/ColorLUT.c +++ b/src/libImaging/ColorLUT.c @@ -50,27 +50,38 @@ ImagingColorLUT3D_linear( +1 cells will be outside of the table. With this compensation we never hit the upper cells but this also doesn't introduce any noticeable difference. */ - int size1D_2D = size1D * size2D; + int y, size1D_2D = size1D * size2D; + ImagingSectionCookie cookie; __m128i scale = _mm_set_epi32(0, (size3D - 1) / 255.0 * (1< 4) { PyErr_SetString(PyExc_ValueError, "table_channels could be 3 or 4"); @@ -91,72 +102,185 @@ ImagingColorLUT3D_linear( for (y = 0; y < imOut->ysize; y++) { UINT8* rowIn = (UINT8 *)imIn->image[y]; UINT32* rowOut = (UINT32 *)imOut->image[y]; + int x = 0; + + #if defined(__AVX2__) + { + __m256i index = _mm256_mullo_epi32(scale256, + _mm256_cvtepu8_epi32(*(__m128i *) &rowIn[x*4])); + __m256i idxs = _mm256_hadd_epi32(_mm256_hadd_epi32( + _mm256_madd_epi16(index_mul256, _mm256_srli_epi32(index, SCALE_BITS)), + _mm256_setzero_si256()), _mm256_setzero_si256()); + int idx1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(idxs)); + int idx2 = _mm256_extract_epi32(idxs, 4); + for (; x < imOut->xsize - 1; x += 2) { + __m256i next_index = _mm256_mullo_epi32(scale256, + _mm256_cvtepu8_epi32(*(__m128i *) &rowIn[x*4 + 8])); + __m256i next_idxs = _mm256_hadd_epi32(_mm256_hadd_epi32( + _mm256_madd_epi16(index_mul256, _mm256_srli_epi32(next_index, SCALE_BITS)), + _mm256_setzero_si256()), _mm256_setzero_si256()); + int next_idx1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(next_idxs)); + int next_idx2 = _mm256_extract_epi32(next_idxs, 4); + + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(scale_mask256, index), (SCALE_BITS - SHIFT_BITS)); + + __m256i shift1D, shift2D, shift3D; + __m256i source, left, right, result; + __m256i leftleft, leftright, rightleft, rightright; + + shift = _mm256_or_si256( + _mm256_sub_epi32(_mm256_set1_epi32((1<xsize; x++) { + for (; x < imOut->xsize; x++) { __m128i next_index = _mm_mullo_epi32(scale, _mm_cvtepu8_epi32(*(__m128i *) &rowIn[x*4 + 4])); int next_idx = _mm_cvtsi128_si32( _mm_hadd_epi32(_mm_hadd_epi32( _mm_madd_epi16(index_mul, _mm_srli_epi32(next_index, SCALE_BITS)), _mm_setzero_si128()), _mm_setzero_si128())); + __m128i shift = _mm_srli_epi32( _mm_and_si128(scale_mask, index), (SCALE_BITS - SHIFT_BITS)); - #if defined(__AVX2__) - __m256i shift1D, shift2D; - __m128i shift3D, result; - __m256i source, left, right; - #else __m128i shift1D, shift2D, shift3D; __m128i source, left, right, result; __m128i leftleft, leftright, rightleft, rightright; - #endif shift = _mm_or_si128( _mm_sub_epi32(_mm_set1_epi32((1<