From 8e3a81a53c62963d186edd685889160c7ca67bc1 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 13 Aug 2017 17:16:16 +0300 Subject: [PATCH] SIMD Filter. 3x3 implementation SIMD Filter. 5x5 implementation SIMD Filter. fast 3x3 filter SIMD Filter. a bit faster 5x5 filter SIMD Filter. improve locality in 5x5 filter SIMD Filter. rearrange 3x3 filter to match 5x5 SIMD Filter. use macros SIMD Filter. use macros in 3x3 SIMD Filter. 3x3 SSE4 singleband SIMD Filter. faster 3x3 singleband SSE4 SIMD Filter. reuse loaded values SIMD Filter. 3x3 SSE4 singleband: 2 lines SIMD Filter. First AVX try SIMD Filter. unroll AVX 2 times SIMD Filter. Macros for AVX SIMD Filter. unroll AVX (with no profit) SIMD Filter. consider last pixel in AVX SIMD Filter. 5x5 single channel SSE4 (tests failed) SIMD Filter. fix offset SIMD Filter. move ImagingFilterxxx functions to separate files SIMD Filter. 3x3i SIMD Filter. better macros SIMD Filter. better loading SIMD Filter. Rearrange instruction for speedup SIMD Filter. reduce number of registers SIMD Filter. rearrange operations SIMD Filter. avx2 version SIMD Filter. finish 3x3i_4u8 SIMD Filter. 5x5i_4u8 SSE4 SIMD Filter. advanced 5x5i_4u8 SSE4 SIMD Filter. 5x5i_4u8 AVX2 SIMD Filter. fix memory access for: 3x3f_u8 3x3i_4u8 5x5i_4u8 SIMD Filter. move files SIMD Filter. Correct offset for 3x3f_u8 # Conflicts: # src/libImaging/Filter.c --- src/libImaging/Filter.c | 343 +++++---------------------- src/libImaging/FilterSIMD_3x3f_4u8.c | 237 ++++++++++++++++++ src/libImaging/FilterSIMD_3x3f_u8.c | 159 +++++++++++++ src/libImaging/FilterSIMD_3x3i_4u8.c | 222 +++++++++++++++++ src/libImaging/FilterSIMD_5x5f_4u8.c | 126 ++++++++++ src/libImaging/FilterSIMD_5x5f_u8.c | 100 ++++++++ src/libImaging/FilterSIMD_5x5i_4u8.c | 250 +++++++++++++++++++ 7 files changed, 1151 insertions(+), 286 deletions(-) create mode 100644 src/libImaging/FilterSIMD_3x3f_4u8.c create mode 100644 src/libImaging/FilterSIMD_3x3f_u8.c create mode 100644 src/libImaging/FilterSIMD_3x3i_4u8.c create mode 100644 src/libImaging/FilterSIMD_5x5f_4u8.c create mode 100644 src/libImaging/FilterSIMD_5x5f_u8.c create mode 100644 src/libImaging/FilterSIMD_5x5i_4u8.c diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index fbd6b425f..df3ca3630 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -26,16 +26,21 @@ #include "Imaging.h" -static inline UINT8 -clip8(float in) { - if (in <= 0.0) { - return 0; - } - if (in >= 255.0) { - return 255; - } - return (UINT8)in; -} +/* 5 is the number of bits needed to account for all kernel coefficients (1<<5 > 25). + 8 is the number of bits in the result. + Any coefficient delta smaller than this precision will have no effect. */ +#define PRECISION_BITS (8 + 5) +/* 16 is the number of bits required for kernel storage. + 1 bit is reserved for the sign.
+ Largest possible kernel coefficient is */ +#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS)) + +#include "FilterSIMD_3x3f_u8.c" +#include "FilterSIMD_3x3f_4u8.c" +#include "FilterSIMD_3x3i_4u8.c" +#include "FilterSIMD_5x5f_u8.c" +#include "FilterSIMD_5x5f_4u8.c" +#include "FilterSIMD_5x5i_4u8.c" static inline INT32 clip32(float in) { @@ -105,283 +110,14 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin) { return imOut; } -void -ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x3(in0, x, kernel, d) \ - (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \ - _i2f(in0[x + d]) * (kernel)[2]) - - int x = 0, y = 0; - - memcpy(imOut->image[0], im->image[0], im->linesize); - if (im->bands == 1) { - // Add one time for rounding - offset += 0.5; - if (im->type == IMAGING_TYPE_INT32) { - for (y = 1; y < im->ysize - 1; y++) { - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *out = (INT32 *)imOut->image[y]; - - out[0] = in0[0]; - for (x = 1; x < im->xsize - 1; x++) { - float ss = offset; - ss += KERNEL1x3(in1, x, &kernel[0], 1); - ss += KERNEL1x3(in0, x, &kernel[3], 1); - ss += KERNEL1x3(in_1, x, &kernel[6], 1); - out[x] = clip32(ss); - } - out[x] = in0[x]; - } - } else { - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - out[0] = in0[0]; - for (x = 1; x < im->xsize - 1; x++) { - float ss = offset; - ss += KERNEL1x3(in1, x, &kernel[0], 1); - ss += KERNEL1x3(in0, x, &kernel[3], 1); - ss += KERNEL1x3(in_1, x, &kernel[6], 1); - out[x] = clip8(ss); - } - out[x] = in0[x]; - } - } - } else { - // Add one time for rounding - offset += 0.5; - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - memcpy(out, in0, sizeof(UINT32)); - if (im->bands == 2) { - for (x = 1; x < im->xsize - 1; x++) { - float ss0 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); - ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); - ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4); - ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); - ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4); - v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 3) { - for (x = 1; x < im->xsize - 1; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - UINT32 v; - ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); - ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4); - ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); - ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4); - ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4); - ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); - ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4); - ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 4) { - for (x = 1; x < im->xsize - 1; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x3(in1, x * 4 + 0, 
&kernel[0], 4); - ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4); - ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); - ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4); - ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4); - ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4); - ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); - ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4); - ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4); - ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } - memcpy(out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32)); - } - } - memcpy(imOut->image[y], im->image[y], im->linesize); -} - -void -ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x5(in0, x, kernel, d) \ - (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \ - _i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] + \ - _i2f(in0[x + d + d]) * (kernel)[4]) - - int x = 0, y = 0; - - memcpy(imOut->image[0], im->image[0], im->linesize); - memcpy(imOut->image[1], im->image[1], im->linesize); - if (im->bands == 1) { - // Add one time for rounding - offset += 0.5; - if (im->type == IMAGING_TYPE_INT32) { - for (y = 2; y < im->ysize - 2; y++) { - INT32 *in_2 = (INT32 *)im->image[y - 2]; - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *in2 = (INT32 *)im->image[y + 2]; - INT32 *out = (INT32 *)imOut->image[y]; - - out[0] = in0[0]; - out[1] = in0[1]; - for (x = 2; x < im->xsize - 2; x++) { - float ss = offset; - ss += KERNEL1x5(in2, x, &kernel[0], 1); - ss += KERNEL1x5(in1, x, &kernel[5], 1); - ss += KERNEL1x5(in0, x, &kernel[10], 1); - ss += KERNEL1x5(in_1, x, &kernel[15], 1); - ss += KERNEL1x5(in_2, x, &kernel[20], 1); - out[x] = clip32(ss); - } - out[x + 0] = in0[x + 0]; - out[x + 1] = in0[x + 1]; - } - } else { - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - out[0] = in0[0]; - out[1] = in0[1]; - for (x = 2; x < im->xsize - 2; x++) { - float ss = offset; - ss += KERNEL1x5(in2, x, &kernel[0], 1); - ss += KERNEL1x5(in1, x, &kernel[5], 1); - ss += KERNEL1x5(in0, x, &kernel[10], 1); - ss += KERNEL1x5(in_1, x, &kernel[15], 1); - ss += KERNEL1x5(in_2, x, &kernel[20], 1); - out[x] = clip8(ss); - } - out[x + 0] = in0[x + 0]; - out[x + 1] = in0[x + 1]; - } - } - } else { - // Add one time for rounding - offset += 0.5; - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - memcpy(out, in0, sizeof(UINT32) * 2); - if (im->bands == 2) { - for (x = 2; x < im->xsize - 2; x++) { - float ss0 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); - ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); - ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x * 4 + 0, 
&kernel[10], 4); - ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); - ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); - ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4); - v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 3) { - for (x = 2; x < im->xsize - 2; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - UINT32 v; - ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); - ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); - ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4); - ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); - ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4); - ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); - ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4); - ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); - ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4); - ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 4) { - for (x = 2; x < im->xsize - 2; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); - ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4); - ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); - ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4); - ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4); - ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); - ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4); - ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4); - ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); - ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4); - ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4); - ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); - ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4); - ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4); - ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } - memcpy( - out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32) * 2 - ); - } - } - memcpy(imOut->image[y], im->image[y], im->linesize); - memcpy(imOut->image[y + 1], im->image[y + 1], im->linesize); -} - Imaging ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) { - Imaging imOut; ImagingSectionCookie cookie; + Imaging imOut; + int i, fast_path = 1; + FLOAT32 tmp; + INT16 norm_kernel[25]; + INT32 norm_offset; if (im->type != IMAGING_TYPE_UINT8 && im->type != IMAGING_TYPE_INT32) { return (Imaging)ImagingError_ModeError(); @@ -400,13 +136,48 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 o return NULL; } + for (i = 0; i < xsize * ysize; i++) { + tmp = kernel[i] * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 
0.5 : -0.5; + if (tmp >= 32768 || tmp <= -32769) { + fast_path = 0; + break; + } + norm_kernel[i] = tmp; + } + tmp = offset * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 0.5 : -0.5; + if (tmp >= 1 << 30) { + norm_offset = 1 << 30; + } else if (tmp <= -1 << 30) { + norm_offset = -1 << 30; + } else { + norm_offset = tmp; + } + ImagingSectionEnter(&cookie); if (xsize == 3) { /* 3x3 kernel. */ - ImagingFilter3x3(imOut, im, kernel, offset); + if (im->image8) { + ImagingFilter3x3f_u8(imOut, im, kernel, offset); + } else { + if (fast_path) { + ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter3x3f_4u8(imOut, im, kernel, offset); + } + } } else { /* 5x5 kernel. */ - ImagingFilter5x5(imOut, im, kernel, offset); + if (im->image8) { + ImagingFilter5x5f_u8(imOut, im, kernel, offset); + } else { + if (fast_path) { + ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter5x5f_4u8(imOut, im, kernel, offset); + } + } } ImagingSectionLeave(&cookie); return imOut; diff --git a/src/libImaging/FilterSIMD_3x3f_4u8.c b/src/libImaging/FilterSIMD_3x3f_4u8.c new file mode 100644 index 000000000..02272f688 --- /dev/null +++ b/src/libImaging/FilterSIMD_3x3f_4u8.c @@ -0,0 +1,237 @@ +void +ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x3_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); + +#define MM_KERNEL1x3_SUM(row, kindex) \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[3 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[6 + kindex]))); + + int x = 0, y = 0; + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + + for (y = 1; y < im->ysize-1; y++) { + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* out = imOut->image32[y]; + +#if defined(__AVX2__) + +#define MM256_LOAD(row, x) \ + pix0##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in_1[x])); + +#define MM256_SUM(pixrow, kernelrow) \ + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \ + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix1##pixrow, kernel1##kernelrow)); \ + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix2##pixrow, kernel2##kernelrow)); + +#define MM256_PERMUTE(row, from0, from1, control) \ + pix0##row = _mm256_permute2f128_ps(pix0##from0, pix0##from1, control); \ + pix1##row = _mm256_permute2f128_ps(pix1##from0, pix1##from1, control); \ + pix2##row = _mm256_permute2f128_ps(pix2##from0, pix2##from1, control); + +#define MM256_OUT(x) \ + _mm_cvtps_epi32(_mm_add_ps( \ + _mm256_extractf128_ps(ss, 0), \ + _mm256_extractf128_ps(ss, 1) \ + )) + + __m256 kernel00 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[0+0]), + _mm_set1_ps(kernel[0+1]), 1); + __m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2])); + __m256 kernel10 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[3+0]), + _mm_set1_ps(kernel[3+1]), 1); + __m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2])); + __m256 kernel20 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[6+0]), + _mm_set1_ps(kernel[6+1]), 1); + __m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2])); + 
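+ /* Register layout for the AVX2 path: the kernel*0 registers pair kernel
+    columns 0 and 1 (one per 128-bit lane), the kernel*1 registers carry
+    column 2 in the low lane, and MM256_OUT folds both lanes of the
+    accumulator into one int32 pixel sum. The unrolled loop below emits
+    8 output pixels per iteration and reuses already loaded data through
+    MM256_PERMUTE rather than reloading it. */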
__m256 pix00, pix10, pix20; + __m256 pix01, pix11, pix21; + __m256 pix02, pix12, pix22; + + out[0] = in0[0]; + x = 1; + MM256_LOAD(0, 0); + for (; x < im->xsize-1-7; x += 8) { + __m256 ss; + __m128i ssi0, ssi1, ssi2, ssi3; + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(0, 0); + MM256_LOAD(1, x+1); + MM256_SUM(1, 1); + ssi0 = MM256_OUT(x); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_LOAD(2, x+3); + MM256_SUM(2, 1); + ssi2 = MM256_OUT(x+2); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_PERMUTE(0, 0, 1, 0x21); + MM256_SUM(0, 0); + MM256_PERMUTE(1, 1, 2, 0x21); + MM256_SUM(1, 1); + ssi1 = MM256_OUT(x+1); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_PERMUTE(0, 2, 2, 0x21); + MM256_SUM(0, 1); + ssi3 = MM256_OUT(x+3); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi2 = _mm_packs_epi32(ssi2, ssi3); + ssi0 = _mm_packus_epi16(ssi0, ssi2); + _mm_storeu_si128((__m128i*) &out[x], ssi0); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(2, 0); + MM256_LOAD(1, x+5); + MM256_SUM(1, 1); + ssi0 = MM256_OUT(x+4); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_LOAD(0, x+7); + MM256_SUM(0, 1); + ssi2 = MM256_OUT(x+6); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_PERMUTE(2, 2, 1, 0x21); + MM256_SUM(2, 0); + MM256_PERMUTE(1, 1, 0, 0x21); + MM256_SUM(1, 1); + ssi1 = MM256_OUT(x+5); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_PERMUTE(2, 0, 0, 0x21); + MM256_SUM(2, 1); + ssi3 = MM256_OUT(x+7); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi2 = _mm_packs_epi32(ssi2, ssi3); + ssi0 = _mm_packus_epi16(ssi0, ssi2); + _mm_storeu_si128((__m128i*) &out[x+4], ssi0); + } + + for (; x < im->xsize-1-1; x += 2) { + __m256 ss; + __m128i ssi0, ssi1; + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(0, 0); + MM256_LOAD(1, x+1); + MM256_SUM(1, 1); + ssi0 = MM256_OUT(x); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_PERMUTE(0, 0, 1, 0x21); + MM256_SUM(0, 0); + MM256_PERMUTE(1, 0, 1, 0xf3); + MM256_SUM(1, 1); + ssi1 = MM256_OUT(x+1); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + _mm_storel_epi64((__m128i*) &out[x], ssi0); + + MM256_PERMUTE(0, 0, 1, 0x21); + } + for (; x < im->xsize-1; x++) { + __m128 pix00, pix10, pix20; + __m128 ss = _mm_set1_ps(offset); + __m128i ssi0; + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM(0, 0); + MM_KERNEL1x3_LOAD(0, x+0); + MM_KERNEL1x3_SUM(0, 1); + MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_SUM(0, 2); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; + +#undef MM256_LOAD +#undef MM256_SUM +#undef MM256_PERMUTE +#undef MM256_OUT + +#else + __m128 pix00, pix10, pix20; + __m128 pix01, pix11, pix21; + __m128 pix02, pix12, pix22; + MM_KERNEL1x3_LOAD(0, 0); + MM_KERNEL1x3_LOAD(1, 1); + + out[0] = in0[0]; + x = 1; + for (; x < im->xsize-1-2; x += 3) { + __m128 ss; + __m128i ssi0, ssi1, ssi2; + + ss = _mm_set1_ps(offset); + MM_KERNEL1x3_SUM(0, 0); + MM_KERNEL1x3_SUM(1, 1); + MM_KERNEL1x3_LOAD(2, x+1); + MM_KERNEL1x3_SUM(2, 2); + ssi0 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x3_SUM(1, 0); + MM_KERNEL1x3_SUM(2, 1); + MM_KERNEL1x3_LOAD(0, x+2); + MM_KERNEL1x3_SUM(0, 2); + ssi1 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x3_SUM(2, 0); + MM_KERNEL1x3_SUM(0, 1); + MM_KERNEL1x3_LOAD(1, 
x+3); + MM_KERNEL1x3_SUM(1, 2); + ssi2 = _mm_cvtps_epi32(ss); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi1 = _mm_packs_epi32(ssi2, ssi2); + ssi0 = _mm_packus_epi16(ssi0, ssi1); + _mm_storeu_si128((__m128i*) &out[x], ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss = _mm_set1_ps(offset); + __m128i ssi0; + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM(0, 0); + MM_KERNEL1x3_LOAD(0, x+0); + MM_KERNEL1x3_SUM(0, 1); + MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_SUM(0, 2); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; +#endif + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); + +#undef MM_KERNEL1x3_LOAD +#undef MM_KERNEL1x3_SUM +} diff --git a/src/libImaging/FilterSIMD_3x3f_u8.c b/src/libImaging/FilterSIMD_3x3f_u8.c new file mode 100644 index 000000000..858366c1d --- /dev/null +++ b/src/libImaging/FilterSIMD_3x3f_u8.c @@ -0,0 +1,159 @@ +void +ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x3_SUM1(ss, row, kernel) \ + ss = _mm_mul_ps(pix0##row, kernel0##kernel); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##kernel)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##kernel)); + +#define MM_KERNEL1x3_SUM1_2(ss, row, kernel) \ + ss = _mm_mul_ps(pix3##row, kernel0##kernel); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel1##kernel)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel)); + +#define MM_KERNEL1x3_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); + + int x, y; + __m128 kernel00 = _mm_set_ps(0, kernel[2], kernel[1], kernel[0]); + __m128 kernel10 = _mm_set_ps(0, kernel[5], kernel[4], kernel[3]); + __m128 kernel20 = _mm_set_ps(0, kernel[8], kernel[7], kernel[6]); + __m128 kernel01 = _mm_set_ps(kernel[2], kernel[1], kernel[0], 0); + __m128 kernel11 = _mm_set_ps(kernel[5], kernel[4], kernel[3], 0); + __m128 kernel21 = _mm_set_ps(kernel[8], kernel[7], kernel[6], 0); + __m128 mm_offset = _mm_set1_ps(offset); + + memcpy(imOut->image8[0], im->image8[0], im->linesize); + y = 1; + for (; y < im->ysize-1-1; y += 2) { + UINT8* in_1 = im->image8[y-1]; + UINT8* in0 = im->image8[y]; + UINT8* in1 = im->image8[y+1]; + UINT8* in2 = im->image8[y+2]; + UINT8* out0 = imOut->image8[y]; + UINT8* out1 = imOut->image8[y+1]; + + out0[0] = in0[0]; + out1[0] = in1[0]; + x = 1; + for (; x < im->xsize-1-3; x += 4) { + __m128 ss0, ss1, ss2, ss3, ss4, ss5; + __m128 pix00, pix10, pix20, pix30; + __m128i ssi0; + + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM1(ss0, 0, 0); + MM_KERNEL1x3_SUM1(ss1, 0, 1); + ss0 = _mm_hadd_ps(ss0, ss1); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x-1])); + MM_KERNEL1x3_SUM1_2(ss3, 0, 0); + MM_KERNEL1x3_SUM1_2(ss4, 0, 1); + ss3 = _mm_hadd_ps(ss3, ss4); + + MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_SUM1(ss1, 0, 0); + MM_KERNEL1x3_SUM1(ss2, 0, 1); + ss1 = _mm_hadd_ps(ss1, ss2); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x+1])); + MM_KERNEL1x3_SUM1_2(ss4, 0, 0); + MM_KERNEL1x3_SUM1_2(ss5, 0, 1); + ss4 = _mm_hadd_ps(ss4, ss5); + + ss0 = _mm_hadd_ps(ss0, ss1); + ss0 = _mm_add_ps(ss0, mm_offset); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out0[x]) = _mm_cvtsi128_si32(ssi0); + + ss3 = _mm_hadd_ps(ss3, ss4); + ss3 = _mm_add_ps(ss3, mm_offset); + 
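+ /* _mm_cvtps_epi32 rounds to int32; _mm_packs_epi32 then saturates to
+    int16 and _mm_packus_epi16 saturates to uint8, which gives the same
+    0..255 clamping as clip8() in the scalar filter. */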
ssi0 = _mm_cvtps_epi32(ss3); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out1[x]) = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss0, ss1; + __m128 pix00, pix10, pix20, pix30; + __m128i ssi0; + + pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); + pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); + pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + pix30 = _mm_set_ps(0, in2[x+1], in2[x], in2[x-1]); + MM_KERNEL1x3_SUM1(ss0, 0, 0); + MM_KERNEL1x3_SUM1_2(ss1, 0, 0); + + ss0 = _mm_hadd_ps(ss0, ss0); + ss0 = _mm_hadd_ps(ss0, ss0); + ss0 = _mm_add_ps(ss0, mm_offset); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out0[x] = _mm_cvtsi128_si32(ssi0); + + ss1 = _mm_hadd_ps(ss1, ss1); + ss1 = _mm_hadd_ps(ss1, ss1); + ss1 = _mm_add_ps(ss1, mm_offset); + ssi0 = _mm_cvtps_epi32(ss1); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out1[x] = _mm_cvtsi128_si32(ssi0); + } + out0[x] = in0[x]; + out1[x] = in1[x]; + } + for (; y < im->ysize-1; y++) { + UINT8* in_1 = im->image8[y-1]; + UINT8* in0 = im->image8[y]; + UINT8* in1 = im->image8[y+1]; + UINT8* out = imOut->image8[y]; + + out[0] = in0[0]; + x = 1; + for (; x < im->xsize-2; x++) { + __m128 ss; + __m128 pix00, pix10, pix20; + __m128i ssi0; + + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM1(ss, 0, 0); + + ss = _mm_hadd_ps(ss, ss); + ss = _mm_hadd_ps(ss, ss); + ss = _mm_add_ps(ss, mm_offset); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss; + __m128 pix00, pix10, pix20; + __m128i ssi0; + + pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); + pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); + pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + MM_KERNEL1x3_SUM1(ss, 0, 0); + + ss = _mm_hadd_ps(ss, ss); + ss = _mm_hadd_ps(ss, ss); + ss = _mm_add_ps(ss, mm_offset); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; + } + memcpy(imOut->image8[y], im->image8[y], im->linesize); + +#undef MM_KERNEL1x3_SUM1 +#undef MM_KERNEL1x3_SUM1_2 +#undef MM_KERNEL1x3_LOAD +} diff --git a/src/libImaging/FilterSIMD_3x3i_4u8.c b/src/libImaging/FilterSIMD_3x3i_4u8.c new file mode 100644 index 000000000..2a52d45eb --- /dev/null +++ b/src/libImaging/FilterSIMD_3x3i_4u8.c @@ -0,0 +1,222 @@ +void +ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, + INT32 offset) +{ + int x, y; + + offset += 1 << (PRECISION_BITS-1); + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + + for (y = 1; y < im->ysize-1; y++) { + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* out = imOut->image32[y]; + +#if defined(__AVX2__) + + #define MM_KERNEL_LOAD(x) \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix10 = _mm256_unpacklo_epi8(source, 
_mm256_setzero_si256()); \ + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + + #define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix0##row, _mm256_shuffle_epi32(kernel00, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix1##row, _mm256_shuffle_epi32(kernel10, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix2##row, _mm256_shuffle_epi32(kernel20, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m256i kernel00 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[2], kernel[1], kernel[0], 0, + 0, 0, 0, 0, 0, kernel[2], kernel[1], kernel[0]); + __m256i kernel10 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[5], kernel[4], kernel[3], 0, + 0, 0, 0, 0, 0, kernel[5], kernel[4], kernel[3]); + __m256i kernel20 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[8], kernel[7], kernel[6], 0, + 0, 0, 0, 0, 0, kernel[8], kernel[7], kernel[6]); + __m256i pix00, pix10, pix20; + __m256i pix01, pix11, pix21; + __m256i source; + + out[0] = in0[0]; + x = 1; + if (im->xsize >= 4) { + MM_KERNEL_LOAD(0); + } + // Last loaded pixel: x+3 + 3 (wide load) + // Last x should be < im->xsize-6 + for (; x < im->xsize-1-5; x += 4) { + __m256i ss0, ss2; + __m128i ssout; + + ss0 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss0, 0, 0x00); + MM_KERNEL_SUM(ss0, 1, 0x55); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + + ss2 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss2, 1, 0x00); + MM_KERNEL_LOAD(x+3); + MM_KERNEL_SUM(ss2, 0, 0x55); + ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS); + + ss0 = _mm256_packs_epi32(ss0, ss2); + ssout = _mm_packus_epi16( + _mm256_extracti128_si256(ss0, 0), + _mm256_extracti128_si256(ss0, 1)); + ssout = _mm_shuffle_epi32(ssout, 0xd8); + _mm_storeu_si128((__m128i*) &out[x], ssout); + } + for (; x < im->xsize-1; x++) { + __m256i ss = _mm256_set1_epi32(offset); + + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in1[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in1[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in0[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in0[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in_1[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in_1[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + + MM_KERNEL_SUM(ss, 0, 0x00); + MM_KERNEL_SUM(ss, 1, 0x55); + + ss = _mm256_srai_epi32(ss, PRECISION_BITS); + + ss = _mm256_packs_epi32(ss, ss); + ss = _mm256_packus_epi16(ss, ss); + out[x] = 
_mm_cvtsi128_si32(_mm256_castsi256_si128(ss)); + } + out[x] = in0[x]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#else + + #define MM_KERNEL_LOAD(x) \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); + + #define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix0##row, _mm_shuffle_epi32(kernel00, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix1##row, _mm_shuffle_epi32(kernel10, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix2##row, _mm_shuffle_epi32(kernel20, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i kernel00 = _mm_set_epi16( + kernel[2], kernel[1], kernel[0], 0, + 0, kernel[2], kernel[1], kernel[0]); + __m128i kernel10 = _mm_set_epi16( + kernel[5], kernel[4], kernel[3], 0, + 0, kernel[5], kernel[4], kernel[3]); + __m128i kernel20 = _mm_set_epi16( + kernel[8], kernel[7], kernel[6], 0, + 0, kernel[8], kernel[7], kernel[6]); + __m128i pix00, pix10, pix20; + __m128i pix01, pix11, pix21; + __m128i source; + + out[0] = in0[0]; + x = 1; + if (im->xsize >= 4) { + MM_KERNEL_LOAD(0); + } + // Last loaded pixel: x+3 + 3 (wide load) + // Last x should be < im->xsize-6 + for (; x < im->xsize-1-5; x += 4) { + __m128i ss0 = _mm_set1_epi32(offset); + __m128i ss1 = _mm_set1_epi32(offset); + __m128i ss2 = _mm_set1_epi32(offset); + __m128i ss3 = _mm_set1_epi32(offset); + + MM_KERNEL_SUM(ss0, 0, 0x00); + MM_KERNEL_SUM(ss0, 1, 0x55); + ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); + + MM_KERNEL_SUM(ss1, 0, 0xaa); + MM_KERNEL_SUM(ss1, 1, 0xff); + ss1 = _mm_srai_epi32(ss1, PRECISION_BITS); + + MM_KERNEL_SUM(ss2, 1, 0x00); + MM_KERNEL_SUM(ss3, 1, 0xaa); + + MM_KERNEL_LOAD(x+3); + + MM_KERNEL_SUM(ss2, 0, 0x55); + MM_KERNEL_SUM(ss3, 0, 0xff); + ss2 = _mm_srai_epi32(ss2, PRECISION_BITS); + ss3 = _mm_srai_epi32(ss3, PRECISION_BITS); + + ss0 = _mm_packs_epi32(ss0, ss1); + ss2 = _mm_packs_epi32(ss2, ss3); + ss0 = _mm_packus_epi16(ss0, ss2); + _mm_storeu_si128((__m128i*) &out[x], ss0); + } + for (; x < im->xsize-1; x++) { + __m128i ss = _mm_set1_epi32(offset); + + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in1[x-1]), shuffle); + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in1[x+1]), shuffle); + pix01 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in0[x-1]), shuffle); + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in0[x+1]), shuffle); + pix11 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in_1[x-1]), shuffle); + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in_1[x+1]), shuffle); + pix21 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + + MM_KERNEL_SUM(ss, 0, 0x00); + MM_KERNEL_SUM(ss, 1, 0x55); + + ss = _mm_srai_epi32(ss, PRECISION_BITS); + + ss = 
_mm_packs_epi32(ss, ss); + ss = _mm_packus_epi16(ss, ss); + out[x] = _mm_cvtsi128_si32(ss); + } + out[x] = in0[x]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#endif + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); +} diff --git a/src/libImaging/FilterSIMD_5x5f_4u8.c b/src/libImaging/FilterSIMD_5x5f_4u8.c new file mode 100644 index 000000000..097242adb --- /dev/null +++ b/src/libImaging/FilterSIMD_5x5f_4u8.c @@ -0,0 +1,126 @@ +void +ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x5_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ + pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); + +#define MM_KERNEL1x5_SUM(row, kindex) \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[5 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[10 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, _mm_set1_ps(kernel[15 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, _mm_set1_ps(kernel[20 + kindex]))); + + int x, y; + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + memcpy(imOut->image32[1], im->image32[1], im->linesize); + for (y = 2; y < im->ysize-2; y++) { + INT32* in_2 = im->image32[y-2]; + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* in2 = im->image32[y+2]; + INT32* out = imOut->image32[y]; + __m128 pix00, pix10, pix20, pix30, pix40; + __m128 pix01, pix11, pix21, pix31, pix41; + __m128 pix02, pix12, pix22, pix32, pix42; + __m128 pix03, pix13, pix23, pix33, pix43; + __m128 pix04, pix14, pix24, pix34, pix44; + MM_KERNEL1x5_LOAD(0, 0); + MM_KERNEL1x5_LOAD(1, 1); + MM_KERNEL1x5_LOAD(2, 2); + MM_KERNEL1x5_LOAD(3, 3); + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + for (; x < im->xsize-2-4; x += 5) { + __m128 ss; + __m128i ssi0, ssi1, ssi2, ssi3; + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(0, 0); + MM_KERNEL1x5_SUM(1, 1); + MM_KERNEL1x5_SUM(2, 2); + MM_KERNEL1x5_SUM(3, 3); + MM_KERNEL1x5_LOAD(4, x+2); + MM_KERNEL1x5_SUM(4, 4); + ssi0 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(1, 0); + MM_KERNEL1x5_SUM(2, 1); + MM_KERNEL1x5_SUM(3, 2); + MM_KERNEL1x5_SUM(4, 3); + MM_KERNEL1x5_LOAD(0, x+3); + MM_KERNEL1x5_SUM(0, 4); + ssi1 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(2, 0); + MM_KERNEL1x5_SUM(3, 1); + MM_KERNEL1x5_SUM(4, 2); + MM_KERNEL1x5_SUM(0, 3); + MM_KERNEL1x5_LOAD(1, x+4); + MM_KERNEL1x5_SUM(1, 4); + ssi2 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(3, 0); + MM_KERNEL1x5_SUM(4, 1); + MM_KERNEL1x5_SUM(0, 2); + MM_KERNEL1x5_SUM(1, 3); + MM_KERNEL1x5_LOAD(2, x+5); + MM_KERNEL1x5_SUM(2, 4); + ssi3 = _mm_cvtps_epi32(ss); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi1 = _mm_packs_epi32(ssi2, ssi3); + ssi0 = _mm_packus_epi16(ssi0, ssi1); + _mm_storeu_si128((__m128i*) &out[x], ssi0); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(4, 0); + MM_KERNEL1x5_SUM(0, 1); + MM_KERNEL1x5_SUM(1, 2); + MM_KERNEL1x5_SUM(2, 3); + MM_KERNEL1x5_LOAD(3, x+6); + MM_KERNEL1x5_SUM(3, 4); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x+4] = _mm_cvtsi128_si32(ssi0); + } + for (; x < 
im->xsize-2; x++) { + __m128 ss = _mm_set1_ps(offset); + __m128i ssi0; + MM_KERNEL1x5_LOAD(0, x-2); + MM_KERNEL1x5_SUM(0, 0); + MM_KERNEL1x5_LOAD(0, x-1); + MM_KERNEL1x5_SUM(0, 1); + MM_KERNEL1x5_LOAD(0, x+0); + MM_KERNEL1x5_SUM(0, 2); + MM_KERNEL1x5_LOAD(0, x+1); + MM_KERNEL1x5_SUM(0, 3); + MM_KERNEL1x5_LOAD(0, x+2); + MM_KERNEL1x5_SUM(0, 4); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; + out[x+1] = in0[x+1]; + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); + memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize); + +#undef MM_KERNEL1x5_LOAD +#undef MM_KERNEL1x5_SUM +} diff --git a/src/libImaging/FilterSIMD_5x5f_u8.c b/src/libImaging/FilterSIMD_5x5f_u8.c new file mode 100644 index 000000000..714bbdf33 --- /dev/null +++ b/src/libImaging/FilterSIMD_5x5f_u8.c @@ -0,0 +1,100 @@ +void +ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x5_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ + pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); + +#define MM_KERNEL1x5_SUM(ss, row, krow) \ + ss = _mm_mul_ps(pix0##row, kernel0##krow); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##krow)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##krow)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, kernel3##krow)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, kernel4##krow)); + + int x, y; + + memcpy(imOut->image8[0], im->image8[0], im->linesize); + memcpy(imOut->image8[1], im->image8[1], im->linesize); + for (y = 2; y < im->ysize-2; y++) { + UINT8* in_2 = im->image8[y-2]; + UINT8* in_1 = im->image8[y-1]; + UINT8* in0 = im->image8[y]; + UINT8* in1 = im->image8[y+1]; + UINT8* in2 = im->image8[y+2]; + UINT8* out = imOut->image8[y]; + __m128 kernelx0 = _mm_set_ps(kernel[15], kernel[10], kernel[5], kernel[0]); + __m128 kernel00 = _mm_loadu_ps(&kernel[0+1]); + __m128 kernel10 = _mm_loadu_ps(&kernel[5+1]); + __m128 kernel20 = _mm_loadu_ps(&kernel[10+1]); + __m128 kernel30 = _mm_loadu_ps(&kernel[15+1]); + __m128 kernel40 = _mm_loadu_ps(&kernel[20+1]); + __m128 kernelx1 = _mm_set_ps(kernel[19], kernel[14], kernel[9], kernel[4]); + __m128 kernel01 = _mm_loadu_ps(&kernel[0+0]); + __m128 kernel11 = _mm_loadu_ps(&kernel[5+0]); + __m128 kernel21 = _mm_loadu_ps(&kernel[10+0]); + __m128 kernel31 = _mm_loadu_ps(&kernel[15+0]); + __m128 kernel41 = _mm_loadu_ps(&kernel[20+0]); + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + for (; x < im->xsize-2-1; x += 2) { + __m128 ss0, ss1; + __m128i ssi0; + __m128 pix00, pix10, pix20, pix30, pix40; + + MM_KERNEL1x5_LOAD(0, x-1); + MM_KERNEL1x5_SUM(ss0, 0, 0); + ss0 = _mm_add_ps(ss0, _mm_mul_ps( + _mm_set_ps(in_1[x-2], in0[x-2], in1[x-2], in2[x-2]), + kernelx0)); + + MM_KERNEL1x5_SUM(ss1, 0, 1); + ss1 = _mm_add_ps(ss1, _mm_mul_ps( + _mm_set_ps(in_1[x+3], in0[x+3], in1[x+3], in2[x+3]), + kernelx1)); + + ss0 = _mm_hadd_ps(ss0, ss1); + ss0 = _mm_add_ps(ss0, _mm_set_ps( + offset, kernel[24] * in_2[x+3], + offset, kernel[20] * in_2[x-2])); + ss0 = _mm_hadd_ps(ss0, ss0); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-2; x ++) { + __m128 ss0; 
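+ /* Tail: one output pixel per iteration. Columns x-2..x+1 of the five
+    input rows go through the 4-wide kernel*1 registers; column x+2 is
+    folded in via kernelx1 plus the kernel[24] * in_2[x+2] term, and two
+    horizontal adds reduce the vector to the final sum. */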
+ __m128i ssi0; + __m128 pix00, pix10, pix20, pix30, pix40; + + MM_KERNEL1x5_LOAD(0, x-2); + MM_KERNEL1x5_SUM(ss0, 0, 1); + ss0 = _mm_add_ps(ss0, _mm_mul_ps( + _mm_set_ps(in_1[x+2], in0[x+2], in1[x+2], in2[x+2]), + kernelx1)); + + ss0 = _mm_add_ps(ss0, _mm_set_ps( + 0, 0, offset, kernel[24] * in_2[x+2])); + ss0 = _mm_hadd_ps(ss0, ss0); + ss0 = _mm_hadd_ps(ss0, ss0); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x+0] = in0[x+0]; + out[x+1] = in0[x+1]; + } + memcpy(imOut->image8[y], im->image8[y], im->linesize); + memcpy(imOut->image8[y+1], im->image8[y+1], im->linesize); + +#undef MM_KERNEL1x5_LOAD +#undef MM_KERNEL1x5_SUM +} diff --git a/src/libImaging/FilterSIMD_5x5i_4u8.c b/src/libImaging/FilterSIMD_5x5i_4u8.c new file mode 100644 index 000000000..d91d472d0 --- /dev/null +++ b/src/libImaging/FilterSIMD_5x5i_4u8.c @@ -0,0 +1,250 @@ +void +ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, + INT32 offset) +{ + int x, y; + + offset += 1 << (PRECISION_BITS-1); + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + memcpy(imOut->image32[1], im->image32[1], im->linesize); + for (y = 2; y < im->ysize-2; y++) { + INT32* in_2 = im->image32[y-2]; + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* in2 = im->image32[y+2]; + INT32* out = imOut->image32[y]; +#if defined(__AVX2__) + + #define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle)); \ + pix0##row = _mm256_inserti128_si256(pix0##row, _mm256_castsi256_si128(pix0##row), 1); \ + pix1##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ + pix1##row = _mm256_inserti128_si256(pix1##row, _mm256_castsi256_si128(pix1##row), 1); \ + pix2##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ + pix2##row = _mm256_inserti128_si256(pix2##row, _mm256_castsi256_si128(pix2##row), 1); \ + pix3##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ + pix3##row = _mm256_inserti128_si256(pix3##row, _mm256_castsi256_si128(pix3##row), 1); \ + pix4##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle)); \ + pix4##row = _mm256_inserti128_si256(pix4##row, _mm256_castsi256_si128(pix4##row), 1); + + #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix0##row, zero), _mm256_shuffle_epi32(kernel0##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix1##row, zero), _mm256_shuffle_epi32(kernel1##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix2##row, zero), _mm256_shuffle_epi32(kernel2##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix3##row, zero), _mm256_shuffle_epi32(kernel3##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix4##row, zero), _mm256_shuffle_epi32(kernel4##krow, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m256i kernel00 = _mm256_set_epi16( + 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0, + 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]); + __m256i kernel10 = _mm256_set_epi16( + 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0, + 0, 0, 0, 
kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]); + __m256i kernel20 = _mm256_set_epi16( + 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0, + 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]); + __m256i kernel30 = _mm256_set_epi16( + 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0, + 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]); + __m256i kernel40 = _mm256_set_epi16( + 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0, + 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]); + __m256i zero = _mm256_setzero_si256(); + __m256i pix00, pix10, pix20, pix30, pix40; + __m256i pix01, pix11, pix21, pix31, pix41; + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + + MM_KERNEL_LOAD(0, 0); + + for (; x < im->xsize-2-3; x += 4) { + __m256i ss0, ss2; + __m128i ssout; + + ss0 = _mm256_set1_epi32(offset); + ss2 = _mm256_set1_epi32(offset); + + MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss2, 0, _mm256_unpackhi_epi8, 0, 0x00); + + MM_KERNEL_LOAD(1, x+2); + MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + MM_KERNEL_SUM(ss2, 1, _mm256_unpacklo_epi8, 0, 0x55); + MM_KERNEL_SUM(ss2, 1, _mm256_unpackhi_epi8, 0, 0xaa); + ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS); + + pix00 = pix01; + pix10 = pix11; + pix20 = pix21; + pix30 = pix31; + pix40 = pix41; + + ss0 = _mm256_packs_epi32(ss0, ss2); + ssout = _mm_packus_epi16( + _mm256_extracti128_si256(ss0, 0), + _mm256_extracti128_si256(ss0, 1)); + ssout = _mm_shuffle_epi32(ssout, 0xd8); + _mm_storeu_si128((__m128i*) &out[x], ssout); + } + for (; x < im->xsize-2; x++) { + __m256i ss0; + + MM_KERNEL_LOAD(0, x-2); + pix01 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle)); + pix01 = _mm256_inserti128_si256(pix01, _mm256_castsi256_si128(pix01), 1); + pix11 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle)); + pix11 = _mm256_inserti128_si256(pix11, _mm256_castsi256_si128(pix11), 1); + pix21 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle)); + pix21 = _mm256_inserti128_si256(pix21, _mm256_castsi256_si128(pix21), 1); + pix31 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle)); + pix31 = _mm256_inserti128_si256(pix31, _mm256_castsi256_si128(pix31), 1); + pix41 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle)); + pix41 = _mm256_inserti128_si256(pix41, _mm256_castsi256_si128(pix41), 1); + + ss0 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + + ss0 = _mm256_packs_epi32(ss0, ss0); + ss0 = _mm256_packus_epi16(ss0, ss0); + out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss0)); + } + out[x] = in0[x]; + out[x+1] = in0[x+1]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#else + + #define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle); \ + pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ + pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ + pix3##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ 
+ pix4##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle); + + #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix0##row, zero), _mm_shuffle_epi32(kernel0##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix1##row, zero), _mm_shuffle_epi32(kernel1##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix2##row, zero), _mm_shuffle_epi32(kernel2##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix3##row, zero), _mm_shuffle_epi32(kernel3##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix4##row, zero), _mm_shuffle_epi32(kernel4##krow, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i kernel00 = _mm_set_epi16( + 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]); + __m128i kernel10 = _mm_set_epi16( + 0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]); + __m128i kernel20 = _mm_set_epi16( + 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]); + __m128i kernel30 = _mm_set_epi16( + 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]); + __m128i kernel40 = _mm_set_epi16( + 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]); + __m128i kernel01 = _mm_set_epi16( + 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0); + __m128i kernel11 = _mm_set_epi16( + 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0); + __m128i kernel21 = _mm_set_epi16( + 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0); + __m128i kernel31 = _mm_set_epi16( + 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0); + __m128i kernel41 = _mm_set_epi16( + 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0); + __m128i zero = _mm_setzero_si128(); + __m128i pix00, pix10, pix20, pix30, pix40; + __m128i pix01, pix11, pix21, pix31, pix41; + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + + MM_KERNEL_LOAD(0, 0); + + for (; x < im->xsize-2-3; x += 4) { + __m128i ss0, ss1, ss2, ss3; + + ss0 = _mm_set1_epi32(offset); + ss1 = _mm_set1_epi32(offset); + ss2 = _mm_set1_epi32(offset); + ss3 = _mm_set1_epi32(offset); + + MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss1, 0, _mm_unpacklo_epi8, 1, 0x00); + MM_KERNEL_SUM(ss1, 0, _mm_unpackhi_epi8, 1, 0x55); + MM_KERNEL_SUM(ss2, 0, _mm_unpackhi_epi8, 0, 0x00); + MM_KERNEL_SUM(ss3, 0, _mm_unpackhi_epi8, 1, 0x00); + + MM_KERNEL_LOAD(1, x+2); + MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa); + ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); + MM_KERNEL_SUM(ss1, 1, _mm_unpacklo_epi8, 1, 0xaa); + ss1 = _mm_srai_epi32(ss1, PRECISION_BITS); + MM_KERNEL_SUM(ss2, 1, _mm_unpacklo_epi8, 0, 0x55); + MM_KERNEL_SUM(ss2, 1, _mm_unpackhi_epi8, 0, 0xaa); + ss2 = _mm_srai_epi32(ss2, PRECISION_BITS); + MM_KERNEL_SUM(ss3, 1, _mm_unpacklo_epi8, 1, 0x55); + MM_KERNEL_SUM(ss3, 1, _mm_unpackhi_epi8, 1, 0xaa); + ss3 = _mm_srai_epi32(ss3, PRECISION_BITS); + + pix00 = pix01; + pix10 = pix11; + pix20 = pix21; + pix30 = pix31; + pix40 = pix41; + + ss0 = _mm_packs_epi32(ss0, ss1); + ss2 = _mm_packs_epi32(ss2, ss3); + ss0 = _mm_packus_epi16(ss0, ss2); + _mm_storeu_si128((__m128i*) &out[x], ss0); + } + for (; x < im->xsize-2; x++) { + __m128i ss0; + + MM_KERNEL_LOAD(0, x-2); + pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle); + pix11 = 
_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle); + pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle); + pix31 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle); + pix41 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle); + + ss0 = _mm_set1_epi32(offset); + MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa); + ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); + + ss0 = _mm_packs_epi32(ss0, ss0); + ss0 = _mm_packus_epi16(ss0, ss0); + out[x] = _mm_cvtsi128_si32(ss0); + } + out[x] = in0[x]; + out[x+1] = in0[x+1]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#endif + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); + memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize); +}
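
Reviewer note, not part of the patch: the integer paths above all rely on the fixed-point scheme that ImagingFilter sets up before dispatching. A minimal scalar sketch of that arithmetic follows, for one channel of one pixel of the 3x3 case; clip8, normalize_kernel and filter3x3i_one here are illustrative helpers written for this note, not functions from the patch.

#include <stdint.h>

#define PRECISION_BITS (8 + 5)   /* 13, same value as in the patch */

/* 0..255 clamp; in the SIMD code the packs/packus instructions do this. */
static uint8_t clip8(int32_t v)
{
    if (v < 0) return 0;
    if (v > 255) return 255;
    return (uint8_t) v;
}

/* Mirror of the normalisation loop in ImagingFilter: scale the float
   coefficients by 1 << PRECISION_BITS and round; return 0 if any value
   does not fit into int16, in which case the float path must be used. */
static int normalize_kernel(int16_t *out, const float *kernel, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        float tmp = kernel[i] * (1 << PRECISION_BITS);
        tmp += (tmp >= 0) ? 0.5f : -0.5f;
        if (tmp >= 32768 || tmp <= -32769) {
            return 0;
        }
        out[i] = (int16_t) tmp;
    }
    return 1;
}

/* One channel of one output pixel of the 3x3 integer path. rows[0..2]
   are the three input scanlines matched to kernel rows 0..2 (the patch
   pairs kernel row 0 with line y+1); kernel16 and offset16 are already
   scaled by 1 << PRECISION_BITS. */
static uint8_t filter3x3i_one(const uint8_t *rows[3], int x,
                              const int16_t kernel16[9], int32_t offset16)
{
    /* Add half of the divisor once so the shift below rounds, exactly
       like the "offset += 1 << (PRECISION_BITS-1)" in the SIMD filters. */
    int32_t ss = offset16 + (1 << (PRECISION_BITS - 1));
    int i, j;
    for (j = 0; j < 3; j++) {
        for (i = 0; i < 3; i++) {
            ss += (int32_t) rows[j][x + i - 1] * kernel16[j * 3 + i];
        }
    }
    return clip8(ss >> PRECISION_BITS);
}

With PRECISION_BITS = 13 and 16-bit signed storage, the largest coefficient that survives normalisation is on the order of 1 << (16 - 1 - 13) = 4, which is what the LARGEST_KERNEL define at the top of Filter.c expresses; anything larger falls back to the float filters.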