SIMD Filter. fix offset

This commit is contained in:
Alexander 2017-09-07 13:35:20 +03:00 committed by Alexander Karpinsky
parent 92d392a99c
commit 256eb37439

View File

@ -29,24 +29,11 @@
#include <emmintrin.h> #include <emmintrin.h>
#include <mmintrin.h> #include <mmintrin.h>
#include <smmintrin.h> #include <smmintrin.h>
#if defined(__AVX2__) #if defined(__AVX2__)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
static inline UINT8 clip8(float in)
{
if (in <= 0.0) {
return 0;
}
if (in >= 255.0) {
return 255;
}
return (UINT8)in;
}
Imaging Imaging
ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode) { ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode) {
Imaging imOut; Imaging imOut;
@ -477,8 +464,6 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
memcpy(imOut->image[0], im->image[0], im->linesize); memcpy(imOut->image[0], im->image[0], im->linesize);
memcpy(imOut->image[1], im->image[1], im->linesize); memcpy(imOut->image[1], im->image[1], im->linesize);
if (im->bands == 1) { if (im->bands == 1) {
// Add one time for rounding
offset += 0.5;
for (y = 2; y < im->ysize-2; y++) { for (y = 2; y < im->ysize-2; y++) {
UINT8* in_2 = (UINT8*) im->image[y-2]; UINT8* in_2 = (UINT8*) im->image[y-2];
UINT8* in_1 = (UINT8*) im->image[y-1]; UINT8* in_1 = (UINT8*) im->image[y-1];
@ -539,14 +524,31 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
ssi0 = _mm_packus_epi16(ssi0, ssi0); ssi0 = _mm_packus_epi16(ssi0, ssi0);
*((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0); *((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0);
} }
for (; x < im->xsize-2; x++) { for (; x < im->xsize-2; x ++) {
float ss = offset; __m128 ss0;
ss += KERNEL1x5(in2, x, &kernel[0], 1); __m128i ssi0;
ss += KERNEL1x5(in1, x, &kernel[5], 1); __m128 pix00, pix10, pix20, pix30, pix40;
ss += KERNEL1x5(in0, x, &kernel[10], 1);
ss += KERNEL1x5(in_1, x, &kernel[15], 1); MM_KERNEL1x5_LOAD(0, x-2);
ss += KERNEL1x5(in_2, x, &kernel[20], 1);
out[x] = clip8(ss); ss0 = _mm_setzero_ps();
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix00, kernel01));
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix10, kernel11));
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix20, kernel21));
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix30, kernel31));
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix40, kernel41));
ss0 = _mm_add_ps(ss0, _mm_mul_ps(
_mm_set_ps(in_1[x+2], in0[x+2], in1[x+2], in2[x+2]),
kernelx1));
ss0 = _mm_add_ps(ss0, _mm_set_ps(
0, 0, offset, kernel[24] * in_2[x+2]));
ss0 = _mm_hadd_ps(ss0, ss0);
ss0 = _mm_hadd_ps(ss0, ss0);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
} }
out[x + 0] = in0[x + 0]; out[x + 0] = in0[x + 0];
out[x + 1] = in0[x + 1]; out[x + 1] = in0[x + 1];