From eb57b5eca7b873322c85f5699313a7f0289087d6 Mon Sep 17 00:00:00 2001 From: Alexander Date: Wed, 13 Sep 2017 01:37:18 +0300 Subject: [PATCH] SIMD Filter. 3x3i --- libImaging/FilterSIMD_3x3i_4u8.c | 107 +++++++++++++++++++++++++++++++ src/libImaging/Filter.c | 44 ++++++++++++- 2 files changed, 148 insertions(+), 3 deletions(-) create mode 100644 libImaging/FilterSIMD_3x3i_4u8.c diff --git a/libImaging/FilterSIMD_3x3i_4u8.c b/libImaging/FilterSIMD_3x3i_4u8.c new file mode 100644 index 000000000..c1c4decea --- /dev/null +++ b/libImaging/FilterSIMD_3x3i_4u8.c @@ -0,0 +1,107 @@ +void +ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, + INT32 offset) +{ +#define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ + pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ + pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); + +#define MM_KERNEL_SUM(ss, row, krow) \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + _mm_unpacklo_epi8(pix0##row, _mm_setzero_si128()), \ + _mm_unpacklo_epi32(kernel0##krow, kernel0##krow))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + _mm_unpacklo_epi8(pix1##row, _mm_setzero_si128()), \ + _mm_unpacklo_epi32(kernel1##krow, kernel1##krow))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + _mm_unpacklo_epi8(pix2##row, _mm_setzero_si128()), \ + _mm_unpacklo_epi32(kernel2##krow, kernel2##krow))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + _mm_unpackhi_epi8(pix0##row, _mm_setzero_si128()), \ + _mm_unpackhi_epi32(kernel0##krow, kernel0##krow))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + _mm_unpackhi_epi8(pix1##row, _mm_setzero_si128()), \ + _mm_unpackhi_epi32(kernel1##krow, kernel1##krow))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + _mm_unpackhi_epi8(pix2##row, _mm_setzero_si128()), \ + _mm_unpackhi_epi32(kernel2##krow, kernel2##krow))); + + int x, y; + + offset += 1 << (PRECISION_BITS-1); + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + + for (y = 1; y < im->ysize-1; y++) { + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* out = imOut->image32[y]; + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i kernel00 = _mm_set_epi16( + 0, kernel[2], 0, kernel[2], + kernel[1], kernel[0], kernel[1], kernel[0]); + __m128i kernel01 = _mm_set_epi16( + kernel[2], kernel[1], kernel[2], kernel[1], + kernel[0], 0, kernel[0], 0); + __m128i kernel10 = _mm_set_epi16( + 0, kernel[5], 0, kernel[5], + kernel[4], kernel[3], kernel[4], kernel[3]); + __m128i kernel11 = _mm_set_epi16( + kernel[5], kernel[4], kernel[5], kernel[4], + kernel[3], 0, kernel[3], 0); + __m128i kernel20 = _mm_set_epi16( + 0, kernel[8], 0, kernel[8], + kernel[7], kernel[6], kernel[7], kernel[6]); + __m128i kernel21 = _mm_set_epi16( + kernel[8], kernel[7], kernel[8], kernel[7], + kernel[6], 0, kernel[6], 0); + // __m128i source; + __m128i pix00, pix10, pix20; + // __m128i pix01, pix11, pix21; + + out[0] = in0[0]; + x = 1; + for (; x < im->xsize-1-3; x += 4) { + __m128i ss0 = _mm_set1_epi32(offset); + __m128i ss1 = _mm_set1_epi32(offset); + __m128i ss2 = _mm_set1_epi32(offset); + __m128i ss3 = _mm_set1_epi32(offset); + + MM_KERNEL_LOAD(0, x-1); + MM_KERNEL_SUM(ss0, 0, 0); + ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); + MM_KERNEL_SUM(ss1, 0, 1); + ss1 = _mm_srai_epi32(ss1, PRECISION_BITS); + ss0 = _mm_packs_epi32(ss0, ss1); + + MM_KERNEL_LOAD(0, x+1); + MM_KERNEL_SUM(ss2, 0, 0); + ss2 = _mm_srai_epi32(ss2, PRECISION_BITS); + MM_KERNEL_SUM(ss3, 0, 1); + ss3 = _mm_srai_epi32(ss3, PRECISION_BITS); + ss2 = _mm_packs_epi32(ss2, ss3); + + ss0 = _mm_packus_epi16(ss0, ss2); + _mm_storeu_si128((__m128i*) &out[x], ss0); + } + for (; x < im->xsize-1; x++) { + __m128i ss = _mm_set1_epi32(offset); + + MM_KERNEL_LOAD(0, x-1); + + MM_KERNEL_SUM(ss, 0, 0); + ss = _mm_srai_epi32(ss, PRECISION_BITS); + + ss = _mm_packs_epi32(ss, ss); + ss = _mm_packus_epi16(ss, ss); + out[x] = _mm_cvtsi128_si32(ss); + } + out[x] = in0[x]; + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); + +#undef MM_KERNEL_LOAD +#undef MM_KERNEL_SUM +} diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index 9fd83ed25..1c5b8b21f 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -33,9 +33,18 @@ #include #endif +/* 5 is number of bits enought to account all kernel coefficients (1<<5 > 25). + 8 is number of bits in result. + Any coefficients delta smaller than this precision will have no effect. */ +#define PRECISION_BITS (8 + 5) +/* 16 is number of bis required for kernel storage. + 1 bit is reserver for sign. + Largest possible kernel coefficient is */ +#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS)) #include "FilterSIMD_3x3f_u8.c" #include "FilterSIMD_3x3f_4u8.c" +#include "FilterSIMD_3x3i_4u8.c" #include "FilterSIMD_5x5f_u8.c" #include "FilterSIMD_5x5f_4u8.c" @@ -97,9 +106,15 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode) { } Imaging -ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) { - Imaging imOut; +ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32* kernel, + FLOAT32 offset) +{ ImagingSectionCookie cookie; + Imaging imOut; + int i, fast_path = 1; + FLOAT32 tmp; + INT16 norm_kernel[25]; + INT32 norm_offset; if (!im || im->type != IMAGING_TYPE_UINT8) { return (Imaging)ImagingError_ModeError(); @@ -118,13 +133,36 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 o return NULL; } + for (i = 0; i < xsize * ysize; i++) { + tmp = kernel[i] * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 0.5 : -0.5; + if (tmp >= 32768 || tmp <= -32769) { + fast_path = 0; + break; + } + norm_kernel[i] = tmp; + } + tmp = offset * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 0.5 : -0.5; + if (tmp >= 1 << 30) { + norm_offset = 1 << 30; + } else if (tmp <= -1 << 30) { + norm_offset = -1 << 30; + } else { + norm_offset = tmp; + } + ImagingSectionEnter(&cookie); if (xsize == 3) { /* 3x3 kernel. */ if (im->image8) { ImagingFilter3x3f_u8(imOut, im, kernel, offset); } else { - ImagingFilter3x3f_4u8(imOut, im, kernel, offset); + if (fast_path) { + ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter3x3f_4u8(imOut, im, kernel, offset); + } } } else { /* 5x5 kernel. */