diff --git a/libImaging/FilterSIMD_5x5i_4u8.c b/libImaging/FilterSIMD_5x5i_4u8.c
index 5fff7c68a..5c6dced03 100644
--- a/libImaging/FilterSIMD_5x5i_4u8.c
+++ b/libImaging/FilterSIMD_5x5i_4u8.c
@@ -15,6 +15,111 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
         INT32* in1 = im->image32[y+1];
         INT32* in2 = im->image32[y+2];
         INT32* out = imOut->image32[y];
+#if defined(__AVX2__)
+
+    #define MM_KERNEL_LOAD(row, x) \
+        pix0##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle)); \
+        pix0##row = _mm256_inserti128_si256(pix0##row, _mm256_castsi256_si128(pix0##row), 1); \
+        pix1##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \
+        pix1##row = _mm256_inserti128_si256(pix1##row, _mm256_castsi256_si128(pix1##row), 1); \
+        pix2##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \
+        pix2##row = _mm256_inserti128_si256(pix2##row, _mm256_castsi256_si128(pix2##row), 1); \
+        pix3##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \
+        pix3##row = _mm256_inserti128_si256(pix3##row, _mm256_castsi256_si128(pix3##row), 1); \
+        pix4##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle)); \
+        pix4##row = _mm256_inserti128_si256(pix4##row, _mm256_castsi256_si128(pix4##row), 1);
+
+    #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \
+        ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
+            unpack_epi8(pix0##row, zero), _mm256_shuffle_epi32(kernel0##krow, kctl))); \
+        ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
+            unpack_epi8(pix1##row, zero), _mm256_shuffle_epi32(kernel1##krow, kctl))); \
+        ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
+            unpack_epi8(pix2##row, zero), _mm256_shuffle_epi32(kernel2##krow, kctl))); \
+        ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
+            unpack_epi8(pix3##row, zero), _mm256_shuffle_epi32(kernel3##krow, kctl))); \
+        ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
+            unpack_epi8(pix4##row, zero), _mm256_shuffle_epi32(kernel4##krow, kctl)));
+
+        __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
+        __m256i kernel00 = _mm256_set_epi16(
+            0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0,
+            0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]);
+        __m256i kernel10 = _mm256_set_epi16(
+            0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0,
+            0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]);
+        __m256i kernel20 = _mm256_set_epi16(
+            0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0,
+            0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]);
+        __m256i kernel30 = _mm256_set_epi16(
+            0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0,
+            0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]);
+        __m256i kernel40 = _mm256_set_epi16(
+            0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0,
+            0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]);
+        __m256i zero = _mm256_setzero_si256();
+        __m256i pix00, pix10, pix20, pix30, pix40;
+        __m256i pix01, pix11, pix21, pix31, pix41;
+
+        out[0] = in0[0];
+        out[1] = in0[1];
+        x = 2;
+
+        MM_KERNEL_LOAD(0, 0);
+
+        for (; x < im->xsize-2-3; x += 4) {
+            __m256i ss0, ss2;
+            __m128i ssout;
+
+            ss0 = _mm256_set1_epi32(offset);
+            ss2 = _mm256_set1_epi32(offset);
+
+            MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00);
+            MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss2, 0, _mm256_unpackhi_epi8, 0, 0x00);
+
+            MM_KERNEL_LOAD(1, x+2);
+            MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa);
+            ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS);
+            MM_KERNEL_SUM(ss2, 1, _mm256_unpacklo_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss2, 1, _mm256_unpackhi_epi8, 0, 0xaa);
+            ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS);
+
+            pix00 = pix01;
+            pix10 = pix11;
+            pix20 = pix21;
+            pix30 = pix31;
+            pix40 = pix41;
+
+            ss0 = _mm256_packs_epi32(ss0, ss2);
+            ssout = _mm_packus_epi16(
+                _mm256_extracti128_si256(ss0, 0),
+                _mm256_extracti128_si256(ss0, 1));
+            ssout = _mm_shuffle_epi32(ssout, 0xd8);
+            _mm_storeu_si128((__m128i*) &out[x], ssout);
+        }
+        for (; x < im->xsize-2; x++) {
+            __m256i ss0;
+
+            MM_KERNEL_LOAD(0, x-2);
+            MM_KERNEL_LOAD(1, x+2);
+
+            ss0 = _mm256_set1_epi32(offset);
+            MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00);
+            MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa);
+            ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS);
+
+            ss0 = _mm256_packs_epi32(ss0, ss0);
+            ss0 = _mm256_packus_epi16(ss0, ss0);
+            out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss0));
+        }
+        out[x] = in0[x];
+        out[x+1] = in0[x+1];
+    #undef MM_KERNEL_LOAD
+    #undef MM_KERNEL_SUM
+
+#else
 
     #define MM_KERNEL_LOAD(row, x) \
         pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle); \
@@ -37,35 +142,25 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
 
         __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
         __m128i kernel00 = _mm_set_epi16(
-            0, 0, 0, kernel[4],
-            kernel[3], kernel[2], kernel[1], kernel[0]);
+            0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]);
         __m128i kernel10 = _mm_set_epi16(
-            0, 0, 0, kernel[9],
-            kernel[8], kernel[7], kernel[6], kernel[5]);
+            0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]);
         __m128i kernel20 = _mm_set_epi16(
-            0, 0, 0, kernel[14],
-            kernel[13], kernel[12], kernel[11], kernel[10]);
+            0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]);
         __m128i kernel30 = _mm_set_epi16(
-            0, 0, 0, kernel[19],
-            kernel[18], kernel[17], kernel[16], kernel[15]);
+            0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]);
         __m128i kernel40 = _mm_set_epi16(
-            0, 0, 0, kernel[24],
-            kernel[23], kernel[22], kernel[21], kernel[20]);
+            0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]);
         __m128i kernel01 = _mm_set_epi16(
-            0, 0, kernel[4], kernel[3],
-            kernel[2], kernel[1], kernel[0], 0);
+            0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0);
         __m128i kernel11 = _mm_set_epi16(
-            0, 0, kernel[9], kernel[8],
-            kernel[7], kernel[6], kernel[5], 0);
+            0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0);
        __m128i kernel21 = _mm_set_epi16(
-            0, 0, kernel[14], kernel[13],
-            kernel[12], kernel[11], kernel[10], 0);
+            0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0);
         __m128i kernel31 = _mm_set_epi16(
-            0, 0, kernel[19], kernel[18],
-            kernel[17], kernel[16], kernel[15], 0);
+            0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0);
         __m128i kernel41 = _mm_set_epi16(
-            0, 0, kernel[24], kernel[23],
-            kernel[22], kernel[21], kernel[20], 0);
+            0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0);
         __m128i zero = _mm_setzero_si128();
         __m128i pix00, pix10, pix20, pix30, pix40;
         __m128i pix01, pix11, pix21, pix31, pix41;
@@ -76,7 +171,7 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
 
         MM_KERNEL_LOAD(0, 0);
 
-        for (; x < im->xsize-2-4; x += 4) {
+        for (; x < im->xsize-2-3; x += 4) {
            __m128i ss0, ss1, ss2, ss3;
 
            ss0 = _mm_set1_epi32(offset);
@@ -132,10 +227,11 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
         }
         out[x] = in0[x];
         out[x+1] = in0[x+1];
+    #undef MM_KERNEL_LOAD
+    #undef MM_KERNEL_SUM
+
+#endif
     }
     memcpy(imOut->image32[y], im->image32[y], im->linesize);
     memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize);
-
-#undef MM_KERNEL_LOAD
-#undef MM_KERNEL_SUM
 }
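
Both the new AVX2 path and the existing SSE4 path compute the same per-channel fixed-point result: a 5x5 multiply-accumulate in 32-bit integers, biased by offset, scaled back by PRECISION_BITS, and saturated to the u8 range (which is what the packs_epi32/packus_epi16 pairs do). The scalar sketch below is illustrative only, not part of the patch; it assumes the row pairing used by MM_KERNEL_LOAD (pix0 is loaded from in2, i.e. the y+2 scanline, down to pix4 from in_2) and that offset already carries the rounding bias added by the caller. The helper name and the precision_bits parameter (standing in for the file's PRECISION_BITS constant) are hypothetical.

#include <stdint.h>

/* Hypothetical scalar model of one channel of one output pixel.
 * rows[0] holds the y+2 scanline and rows[4] the y-2 scanline,
 * matching the pix0..pix4 / in2..in_2 pairing in MM_KERNEL_LOAD. */
static uint8_t
filter5x5i_one_channel(const uint8_t rows[5][5], const int16_t *kernel,
                       int32_t offset, int precision_bits)
{
    /* offset is assumed to already include the rounding term */
    int32_t ss = offset;
    int ky, kx;

    for (ky = 0; ky < 5; ky++) {
        for (kx = 0; kx < 5; kx++) {
            ss += (int32_t) kernel[ky * 5 + kx] * rows[ky][kx];
        }
    }
    /* scale back, as _mm256_srai_epi32(ss, PRECISION_BITS) does */
    ss >>= precision_bits;

    /* the saturation performed by the packs_epi32/packus_epi16 pairs */
    if (ss < 0)   ss = 0;
    if (ss > 255) ss = 255;
    return (uint8_t) ss;
}

On the loop-bound change from im->xsize-2-4 to im->xsize-2-3: an iteration of the main SSE4 loop at position x reads input through MM_KERNEL_LOAD(1, x+2), i.e. pixels x+2..x+5, and stores out[x..x+3], so the largest safe start is x = xsize-6. The new condition admits exactly that value, while the old -4 bound needlessly pushed the last full 4-pixel block into the scalar tail loop.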