/*
 * Patch summary (free-text preamble ahead of the diff header).
 *
 * Target: src/libImaging/Filter.c
 *
 * ImagingFilter3x3
 *   - The first and last pixel of each row are copied from in0 without the
 *     (UINT32*) casts that the old lines applied to in0.
 *
 * ImagingFilter5x5
 *   - Adds MM_KERNEL1x5(ss, in0, x, kernel, d): for the five pixels at
 *     x-2d, x-d, x, x+d, x+2d of row in0 it widens the pixel's four bytes to
 *     four floats, multiplies them by kernel[0..4] respectively and
 *     accumulates the products into the __m128 accumulator ss.
 *   - Each tap loads exactly one 32-bit pixel via _mm_cvtsi32_si128().
 *     Dereferencing a "__m128i *" cast of the pixel address instead would be
 *     a 16-byte load; with x+d+d reaching xsize-1 in the loop below, that
 *     load would cover 12 bytes beyond the last pixel of the row and would
 *     assume an alignment a pixel address does not guarantee.
 *     _mm_cvtepu8_epi32 only consumes the low 32 bits, so the arithmetic
 *     is the same.
 *   - In the else branch the row pointers become UINT32* and the three
 *     scalar loops (im->bands == 2 / 3 / 4) are replaced by a single loop
 *     that runs MM_KERNEL1x5 over rows in2..in_2 with kernel[0], [5], [10],
 *     [15], [20], converts with _mm_cvtps_epi32, narrows with
 *     _mm_packs_epi32 and _mm_packus_epi16, and stores the low 32 bits
 *     to out[x].
 *   - "offset += 0.5" is removed; presumably because _mm_cvtps_epi32 rounds
 *     instead of truncating under the default MXCSR mode -- TODO confirm.
 *   - NOTE(review): the removed bands == 2 and bands == 3 loops wrote 0 into
 *     the bytes they did not compute; the new loop writes a filtered value
 *     into all four bytes. Confirm no caller relies on those bytes being 0.
 */
diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index 0d4047507..51e22530c 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -139,7 +139,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel, UINT32* in1 = (UINT32*) im->image[y+1]; UINT32* out = (UINT32*) imOut->image[y]; - out[0] = ((UINT32*) in0)[0]; + out[0] = in0[0]; for (x = 1; x < im->xsize-1; x++) { __m128 ss = _mm_set1_ps(offset); __m128i ssi; @@ -153,7 +153,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel, ssi = _mm_packus_epi16(ssi, ssi); out[x] = _mm_cvtsi128_si32(ssi); } - out[x] = ((UINT32*) in0)[x]; + out[x] = in0[x]; } } memcpy(imOut->image[y], im->image[y], im->linesize); @@ -171,6 +171,18 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float* kernel, _i2f((UINT8) in0[x+d]) * (kernel)[3] + \ _i2f((UINT8) in0[x+d+d]) * (kernel)[4]) +#define MM_KERNEL1x5(ss, in0, x, kernel, d) \ + ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[0]), \ + _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) (in0)[x-d-d]))))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[1]), \ + _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) (in0)[x-d]))))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[2]), \ + _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) (in0)[x+0]))))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[3]), \ + _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) (in0)[x+d]))))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[4]), \ + _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) (in0)[x+d+d]))))); + int x = 0, y = 0; memcpy(imOut->image[0], im->image[0], im->linesize); @@ -201,89 +213,33 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float* kernel, out[x+1] = in0[x+1]; } } else { - // Add one time for rounding - offset += 0.5; for (y = 2; y < im->ysize-2; y++) { - UINT8* in_2 = (UINT8*) im->image[y-2]; - UINT8* in_1 = (UINT8*) im->image[y-1]; - UINT8* in0 = (UINT8*) im->image[y]; - UINT8* in1 = (UINT8*) 
im->image[y+1]; - UINT8* in2 = (UINT8*) im->image[y+2]; + UINT32* in_2 = (UINT32*) im->image[y-2]; + UINT32* in_1 = (UINT32*) im->image[y-1]; + UINT32* in0 = (UINT32*) im->image[y]; + UINT32* in1 = (UINT32*) im->image[y+1]; + UINT32* in2 = (UINT32*) im->image[y+2]; UINT32* out = (UINT32*) imOut->image[y]; - out[0] = ((UINT32*) in0)[0]; - out[1] = ((UINT32*) in0)[1]; - if (im->bands == 2) { - for (x = 2; x < im->xsize-2; x++) { - float ss0 = offset; - float ss3 = offset; - ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4); - ss3 += KERNEL1x5(in2, x*4+3, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4); - ss3 += KERNEL1x5(in1, x*4+3, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4); - ss3 += KERNEL1x5(in0, x*4+3, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4); - ss3 += KERNEL1x5(in_1, x*4+3, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4); - ss3 += KERNEL1x5(in_2, x*4+3, &kernel[20], 4); - out[x] = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); - } - } else if (im->bands == 3) { - for (x = 2; x < im->xsize-2; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4); - ss1 += KERNEL1x5(in2, x*4+1, &kernel[0], 4); - ss2 += KERNEL1x5(in2, x*4+2, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4); - ss1 += KERNEL1x5(in1, x*4+1, &kernel[5], 4); - ss2 += KERNEL1x5(in1, x*4+2, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4); - ss1 += KERNEL1x5(in0, x*4+1, &kernel[10], 4); - ss2 += KERNEL1x5(in0, x*4+2, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4); - ss1 += KERNEL1x5(in_1, x*4+1, &kernel[15], 4); - ss2 += KERNEL1x5(in_1, x*4+2, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4); - ss1 += KERNEL1x5(in_2, x*4+1, &kernel[20], 4); - ss2 += KERNEL1x5(in_2, x*4+2, &kernel[20], 4); - out[x] = MAKE_UINT32( - clip8(ss0), clip8(ss1), clip8(ss2), 0); - } - } else if (im->bands == 4) { - for (x = 2; x < im->xsize-2; 
x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - float ss3 = offset; - ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4); - ss1 += KERNEL1x5(in2, x*4+1, &kernel[0], 4); - ss2 += KERNEL1x5(in2, x*4+2, &kernel[0], 4); - ss3 += KERNEL1x5(in2, x*4+3, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4); - ss1 += KERNEL1x5(in1, x*4+1, &kernel[5], 4); - ss2 += KERNEL1x5(in1, x*4+2, &kernel[5], 4); - ss3 += KERNEL1x5(in1, x*4+3, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4); - ss1 += KERNEL1x5(in0, x*4+1, &kernel[10], 4); - ss2 += KERNEL1x5(in0, x*4+2, &kernel[10], 4); - ss3 += KERNEL1x5(in0, x*4+3, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4); - ss1 += KERNEL1x5(in_1, x*4+1, &kernel[15], 4); - ss2 += KERNEL1x5(in_1, x*4+2, &kernel[15], 4); - ss3 += KERNEL1x5(in_1, x*4+3, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4); - ss1 += KERNEL1x5(in_2, x*4+1, &kernel[20], 4); - ss2 += KERNEL1x5(in_2, x*4+2, &kernel[20], 4); - ss3 += KERNEL1x5(in_2, x*4+3, &kernel[20], 4); - out[x] = MAKE_UINT32( - clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); - } + out[0] = in0[0]; + out[1] = in0[1]; + for (x = 2; x < im->xsize-2; x++) { + __m128 ss = _mm_set1_ps(offset); + __m128i ssi; + + MM_KERNEL1x5(ss, in2, x, &kernel[0], 1); + MM_KERNEL1x5(ss, in1, x, &kernel[5], 1); + MM_KERNEL1x5(ss, in0, x, &kernel[10], 1); + MM_KERNEL1x5(ss, in_1, x, &kernel[15], 1); + MM_KERNEL1x5(ss, in_2, x, &kernel[20], 1); + + ssi = _mm_cvtps_epi32(ss); + ssi = _mm_packs_epi32(ssi, ssi); + ssi = _mm_packus_epi16(ssi, ssi); + out[x] = _mm_cvtsi128_si32(ssi); } - out[x] = ((UINT32*) in0)[x]; - out[x+1] = ((UINT32*) in0)[x+1]; + out[x] = in0[x]; + out[x+1] = in0[x+1]; } } memcpy(imOut->image[y], im->image[y], im->linesize);