diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index 7360f9681..5fb9b060c 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -480,10 +480,60 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float* kernel, UINT8* in1 = (UINT8*) im->image[y+1]; UINT8* in2 = (UINT8*) im->image[y+2]; UINT8* out = (UINT8*) imOut->image[y]; + __m128 kernelx0 = _mm_set_ps(kernel[15], kernel[10], kernel[5], kernel[0]); + __m128 kernel00 = _mm_loadu_ps(&kernel[0+1]); + __m128 kernel10 = _mm_loadu_ps(&kernel[5+1]); + __m128 kernel20 = _mm_loadu_ps(&kernel[10+1]); + __m128 kernel30 = _mm_loadu_ps(&kernel[15+1]); + __m128 kernel40 = _mm_loadu_ps(&kernel[20+1]); + __m128 kernelx1 = _mm_set_ps(kernel[19], kernel[14], kernel[9], kernel[4]); + __m128 kernel01 = _mm_loadu_ps(&kernel[0+0]); + __m128 kernel11 = _mm_loadu_ps(&kernel[5+0]); + __m128 kernel21 = _mm_loadu_ps(&kernel[10+0]); + __m128 kernel31 = _mm_loadu_ps(&kernel[15+0]); + __m128 kernel41 = _mm_loadu_ps(&kernel[20+0]); out[0] = in0[0]; out[1] = in0[1]; - for (x = 2; x < im->xsize-2; x++) { + x = 2; + for (; x < im->xsize-2-1; x += 2) { + __m128 ss0, ss1; + __m128i ssi0; + __m128 pix00, pix10, pix20, pix30, pix40; + + MM_KERNEL1x5_LOAD(0, x-1); + + ss0 = _mm_setzero_ps(); + ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix00, kernel00)); + ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix10, kernel10)); + ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix20, kernel20)); + ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix30, kernel30)); + ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix40, kernel40)); + ss0 = _mm_add_ps(ss0, _mm_mul_ps( + _mm_set_ps(in_1[x-2], in0[x-2], in1[x-2], in2[x-2]), + kernelx0)); + + ss1 = _mm_setzero_ps(); + ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix00, kernel01)); + ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix10, kernel11)); + ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix20, kernel21)); + ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix30, kernel31)); + ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix40, kernel41)); + ss1 = _mm_add_ps(ss1, _mm_mul_ps( + _mm_set_ps(in_1[x+3], in0[x+3], in1[x+3], in2[x+3]), + kernelx1)); + + ss0 = _mm_hadd_ps(ss0, ss1); + ss0 = _mm_add_ps(ss0, _mm_set_ps( + offset, kernel[24] * in_2[x+3], + offset, kernel[20] * in_2[x-2])); + ss0 = _mm_hadd_ps(ss0, ss0); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-2; x++) { float ss = offset; ss += KERNEL1x5(in2, x, &kernel[0], 1); ss += KERNEL1x5(in1, x, &kernel[5], 1);