From a1ce80cb295c0c8c8837d217031e6bacfc1eb4ac Mon Sep 17 00:00:00 2001 From: Alexander Date: Wed, 13 Sep 2017 05:37:40 +0300 Subject: [PATCH] SIMD Filter. avx2 version --- libImaging/FilterSIMD_3x3i_4u8.c | 130 +++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 22 deletions(-) diff --git a/libImaging/FilterSIMD_3x3i_4u8.c b/libImaging/FilterSIMD_3x3i_4u8.c index bccfaeaf7..1c580f804 100644 --- a/libImaging/FilterSIMD_3x3i_4u8.c +++ b/libImaging/FilterSIMD_3x3i_4u8.c @@ -2,25 +2,6 @@ void ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, INT32 offset) { -#define MM_KERNEL_LOAD(x) \ - source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ - pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ - pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ - source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ - pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ - pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ - source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ - pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ - pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); - -#define MM_KERNEL_SUM(ss, row, kctl) \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - pix0##row, _mm_shuffle_epi32(kernel00, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - pix1##row, _mm_shuffle_epi32(kernel10, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - pix2##row, _mm_shuffle_epi32(kernel20, kctl))); \ - int x, y; offset += 1 << (PRECISION_BITS-1); @@ -32,6 +13,110 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, INT32* in0 = im->image32[y]; INT32* in1 = im->image32[y+1]; INT32* out = imOut->image32[y]; + +#if defined(__AVX2__) + + #define MM_KERNEL_LOAD(x) \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + + #define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix0##row, _mm256_shuffle_epi32(kernel00, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix1##row, _mm256_shuffle_epi32(kernel10, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix2##row, _mm256_shuffle_epi32(kernel20, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m256i kernel00 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[2], kernel[1], kernel[0], 0, + 0, 0, 0, 0, 0, kernel[2], kernel[1], kernel[0]); + __m256i kernel10 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[5], kernel[4], kernel[3], 0, + 0, 0, 0, 0, 0, kernel[5], kernel[4], kernel[3]); + __m256i kernel20 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[8], kernel[7], kernel[6], 0, + 0, 0, 0, 0, 0, kernel[8], kernel[7], kernel[6]); + __m256i pix00, pix10, pix20; + __m256i pix01, pix11, pix21; + __m256i source; + + out[0] = in0[0]; + x = 1; + MM_KERNEL_LOAD(0); + for (; x < im->xsize-1-3; x += 4) { + __m256i ss0, ss2; + __m128i ssout; + + ss0 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss0, 0, 0x00); + MM_KERNEL_SUM(ss0, 1, 0x55); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + + ss2 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss2, 1, 0x00); + + MM_KERNEL_LOAD(x+3); + + MM_KERNEL_SUM(ss2, 0, 0x55); + ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS); + + ss0 = _mm256_packs_epi32(ss0, ss2); + ssout = _mm_packus_epi16( + _mm256_extracti128_si256(ss0, 0), + _mm256_extracti128_si256(ss0, 1)); + ssout = _mm_shuffle_epi32(ssout, 216); + _mm_storeu_si128((__m128i*) &out[x], ssout); + } + // for (; x < im->xsize-1; x++) { + // __m128i ss = _mm_set1_epi32(offset); + + // MM_KERNEL_LOAD(x-1); + // MM_KERNEL_SUM(ss, 0, 0x00); + // MM_KERNEL_SUM(ss, 1, 0x55); + + // ss = _mm_srai_epi32(ss, PRECISION_BITS); + + // ss = _mm_packs_epi32(ss, ss); + // ss = _mm_packus_epi16(ss, ss); + // out[x] = _mm_cvtsi128_si32(ss); + // } + out[x] = in0[x]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#else + + #define MM_KERNEL_LOAD(x) \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); + + #define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix0##row, _mm_shuffle_epi32(kernel00, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix1##row, _mm_shuffle_epi32(kernel10, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix2##row, _mm_shuffle_epi32(kernel20, kctl))); \ + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); __m128i kernel00 = _mm_set_epi16( kernel[2], kernel[1], kernel[0], 0, @@ -92,9 +177,10 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, out[x] = _mm_cvtsi128_si32(ss); } out[x] = in0[x]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#endif } memcpy(imOut->image32[y], im->image32[y], im->linesize); - -#undef MM_KERNEL_LOAD -#undef MM_KERNEL_SUM }