diff --git a/libImaging/FilterSIMD_5x5i_4u8.c b/libImaging/FilterSIMD_5x5i_4u8.c
index d1fb60fc5..5fff7c68a 100644
--- a/libImaging/FilterSIMD_5x5i_4u8.c
+++ b/libImaging/FilterSIMD_5x5i_4u8.c
@@ -2,20 +2,6 @@ void
 ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
                       INT32 offset)
 {
-#define MM_KERNEL_LOAD(row, x) \
-    pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x])); \
-    pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \
-    pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \
-    pix3##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); \
-    pix4##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_2[x]));
-
-#define MM_KERNEL_SUM(row, kindex) \
-    ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \
-    ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[5 + kindex]))); \
-    ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[10 + kindex]))); \
-    ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, _mm_set1_ps(kernel[15 + kindex]))); \
-    ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, _mm_set1_ps(kernel[20 + kindex])));
-
     int x, y;
 
     offset += 1 << (PRECISION_BITS-1);
@@ -29,6 +15,26 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
         INT32* in1 = im->image32[y+1];
         INT32* in2 = im->image32[y+2];
         INT32* out = imOut->image32[y];
+
+#define MM_KERNEL_LOAD(row, x) \
+    pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle); \
+    pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \
+    pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \
+    pix3##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \
+    pix4##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle);
+
+#define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \
+    ss = _mm_add_epi32(ss, _mm_madd_epi16( \
+        unpack_epi8(pix0##row, zero), _mm_shuffle_epi32(kernel0##krow, kctl))); \
+    ss = _mm_add_epi32(ss, _mm_madd_epi16( \
+        unpack_epi8(pix1##row, zero), _mm_shuffle_epi32(kernel1##krow, kctl))); \
+    ss = _mm_add_epi32(ss, _mm_madd_epi16( \
+        unpack_epi8(pix2##row, zero), _mm_shuffle_epi32(kernel2##krow, kctl))); \
+    ss = _mm_add_epi32(ss, _mm_madd_epi16( \
+        unpack_epi8(pix3##row, zero), _mm_shuffle_epi32(kernel3##krow, kctl))); \
+    ss = _mm_add_epi32(ss, _mm_madd_epi16( \
+        unpack_epi8(pix4##row, zero), _mm_shuffle_epi32(kernel4##krow, kctl)));
+
         __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
         __m128i kernel00 = _mm_set_epi16(
             0, 0, 0, kernel[4],
@@ -45,79 +51,79 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
         __m128i kernel40 = _mm_set_epi16(
             0, 0, 0, kernel[24],
             kernel[23], kernel[22], kernel[21], kernel[20]);
+        __m128i kernel01 = _mm_set_epi16(
+            0, 0, kernel[4], kernel[3],
+            kernel[2], kernel[1], kernel[0], 0);
+        __m128i kernel11 = _mm_set_epi16(
+            0, 0, kernel[9], kernel[8],
+            kernel[7], kernel[6], kernel[5], 0);
+        __m128i kernel21 = _mm_set_epi16(
+            0, 0, kernel[14], kernel[13],
+            kernel[12], kernel[11], kernel[10], 0);
+        __m128i kernel31 = _mm_set_epi16(
+            0, 0, kernel[19], kernel[18],
+            kernel[17], kernel[16], kernel[15], 0);
+        __m128i kernel41 = _mm_set_epi16(
+            0, 0, kernel[24], kernel[23],
+            kernel[22], kernel[21], kernel[20], 0);
         __m128i zero = _mm_setzero_si128();
         __m128i pix00, pix10, pix20, pix30, pix40;
         __m128i pix01, pix11, pix21, pix31, pix41;
-
         out[0] = in0[0];
         out[1] = in0[1];
         x = 2;
+
+        MM_KERNEL_LOAD(0, 0);
+
+        for (; x < im->xsize-2-4; x += 4) {
+            __m128i ss0, ss1, ss2, ss3;
+
+            ss0 = _mm_set1_epi32(offset);
+            ss1 = _mm_set1_epi32(offset);
+            ss2 = _mm_set1_epi32(offset);
+            ss3 = _mm_set1_epi32(offset);
+
+            MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
+            MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss1, 0, _mm_unpacklo_epi8, 1, 0x00);
+            MM_KERNEL_SUM(ss1, 0, _mm_unpackhi_epi8, 1, 0x55);
+            MM_KERNEL_SUM(ss2, 0, _mm_unpackhi_epi8, 0, 0x00);
+            MM_KERNEL_SUM(ss3, 0, _mm_unpackhi_epi8, 1, 0x00);
+
+            MM_KERNEL_LOAD(1, x+2);
+            MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa);
+            ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
+            MM_KERNEL_SUM(ss1, 1, _mm_unpacklo_epi8, 1, 0xaa);
+            ss1 = _mm_srai_epi32(ss1, PRECISION_BITS);
+            MM_KERNEL_SUM(ss2, 1, _mm_unpacklo_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss2, 1, _mm_unpackhi_epi8, 0, 0xaa);
+            ss2 = _mm_srai_epi32(ss2, PRECISION_BITS);
+            MM_KERNEL_SUM(ss3, 1, _mm_unpacklo_epi8, 1, 0x55);
+            MM_KERNEL_SUM(ss3, 1, _mm_unpackhi_epi8, 1, 0xaa);
+            ss3 = _mm_srai_epi32(ss3, PRECISION_BITS);
+
+            pix00 = pix01;
+            pix10 = pix11;
+            pix20 = pix21;
+            pix30 = pix31;
+            pix40 = pix41;
+
+            ss0 = _mm_packs_epi32(ss0, ss1);
+            ss2 = _mm_packs_epi32(ss2, ss3);
+            ss0 = _mm_packus_epi16(ss0, ss2);
+            _mm_storeu_si128((__m128i*) &out[x], ss0);
+        }
 
         for (; x < im->xsize-2; x++) {
             __m128i ss0;
 
-            pix00 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x-2]), shuffle);
-            pix10 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x-2]), shuffle);
-            pix20 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x-2]), shuffle);
-            pix30 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x-2]), shuffle);
-            pix40 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x-2]), shuffle);
-
-            pix01 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x+2]), shuffle);
-            pix11 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x+2]), shuffle);
-            pix21 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x+2]), shuffle);
-            pix31 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x+2]), shuffle);
-            pix41 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x+2]), shuffle);
+            MM_KERNEL_LOAD(0, x-2);
+            MM_KERNEL_LOAD(1, x+2);
 
             ss0 = _mm_set1_epi32(offset);
-
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix00, zero),
-                _mm_shuffle_epi32(kernel00, 0x00)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix10, zero),
-                _mm_shuffle_epi32(kernel10, 0x00)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix20, zero),
-                _mm_shuffle_epi32(kernel20, 0x00)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix30, zero),
-                _mm_shuffle_epi32(kernel30, 0x00)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix40, zero),
-                _mm_shuffle_epi32(kernel40, 0x00)));
-
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpackhi_epi8(pix00, zero),
-                _mm_shuffle_epi32(kernel00, 0x55)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpackhi_epi8(pix10, zero),
-                _mm_shuffle_epi32(kernel10, 0x55)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpackhi_epi8(pix20, zero),
-                _mm_shuffle_epi32(kernel20, 0x55)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpackhi_epi8(pix30, zero),
-                _mm_shuffle_epi32(kernel30, 0x55)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpackhi_epi8(pix40, zero),
-                _mm_shuffle_epi32(kernel40, 0x55)));
-
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix01, zero),
-                _mm_shuffle_epi32(kernel00, 0xaa)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix11, zero),
-                _mm_shuffle_epi32(kernel10, 0xaa)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix21, zero),
-                _mm_shuffle_epi32(kernel20, 0xaa)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix31, zero),
-                _mm_shuffle_epi32(kernel30, 0xaa)));
-            ss0 = _mm_add_epi32(ss0, _mm_madd_epi16(
-                _mm_unpacklo_epi8(pix41, zero),
-                _mm_shuffle_epi32(kernel40, 0xaa)));
-
+            MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
+            MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa);
             ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
             ss0 = _mm_packs_epi32(ss0, ss0);
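Note on the approach (commentary, not part of the patch): the removed macros widened each pixel with _mm_cvtepu8_epi32 and accumulated in floats via _mm_mul_ps/_mm_add_ps, one kernel tap per multiply. The new code stays in integers: _mm_shuffle_epi8 interleaves the channels of adjacent pixel pairs (R0 R1 G0 G1 B0 B1 A0 A1 ...), _mm_unpacklo_epi8/_mm_unpackhi_epi8 widen them to 16 bits, and _mm_madd_epi16 applies two kernel taps to all four channels in one instruction, accumulating into 32-bit lanes that are finally rescaled by PRECISION_BITS and repacked with _mm_packs_epi32/_mm_packus_epi16. The kernel01..kernel41 registers repeat the five taps of kernel00..kernel40 shifted by one 16-bit element, so that _mm_shuffle_epi32 with the kctl immediate can broadcast any adjacent tap pair. Below is a minimal, self-contained sketch of that core pairwise trick for a single pixel pair; px, k, kern, taps, acc are illustrative names that do not appear in the patch, and it builds standalone with -mssse3:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>  /* SSSE3 for _mm_shuffle_epi8; the rest is SSE2 */

int main(void)
{
    /* two RGBA pixels, one 32-bit word each: {1,2,3,4} and {5,6,7,8} */
    uint32_t px[4] = {0x04030201, 0x08070605, 0, 0};
    int16_t k[2] = {3, 5};  /* two adjacent kernel taps */

    /* same shuffle constant as the patch: pairs up the channels of
       neighbouring pixels, giving R0 R1 G0 G1 B0 B1 A0 A1 | R2 R3 ... */
    __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
    __m128i zero = _mm_setzero_si128();

    __m128i pix = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) px), shuffle);
    /* zero-extend the first pixel pair to 16-bit lanes */
    __m128i lo = _mm_unpacklo_epi8(pix, zero);
    /* kernel register in the patch's layout; shuffling with 0x00
       broadcasts the (k0, k1) pair into every 32-bit lane, exactly
       what the kctl argument of MM_KERNEL_SUM selects */
    __m128i kern = _mm_set_epi16(0, 0, 0, 0, 0, 0, k[1], k[0]);
    __m128i taps = _mm_shuffle_epi32(kern, 0x00);
    /* one madd per tap pair: each 32-bit lane gets k0*c0 + k1*c1 */
    __m128i acc = _mm_madd_epi16(lo, taps);

    int32_t res[4];
    _mm_storeu_si128((__m128i*) res, acc);
    printf("R=%d G=%d B=%d A=%d\n", res[0], res[1], res[2], res[3]);
    return 0;
}

This prints R=28 G=36 B=44 A=52, i.e. 3*c0 + 5*c1 per channel. It also explains why the per-pixel loop above needs only three MM_KERNEL_SUM invocations, covering tap pairs 0-1 (kctl 0x00), 2-3 (0x55), and 4 padded with a zero (0xaa), each expanding to all five kernel rows.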