diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index efec61657..1bc882e95 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -244,6 +244,57 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel, UINT32* in0 = (UINT32*) im->image[y]; UINT32* in1 = (UINT32*) im->image[y+1]; UINT32* out = (UINT32*) imOut->image[y]; +#if defined(__AVX2__) + __m256 kernel00 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[0+0]), + _mm_set1_ps(kernel[0+1]), 1); + __m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2])); + __m256 kernel10 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[3+0]), + _mm_set1_ps(kernel[3+1]), 1); + __m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2])); + __m256 kernel20 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[6+0]), + _mm_set1_ps(kernel[6+1]), 1); + __m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2])); + __m256 pix00, pix10, pix20; + __m256 pix01, pix11, pix21; + + out[0] = in0[0]; + x = 1; + pix00 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[0])); + pix10 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[0])); + pix20 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[0])); + for (; x < im->xsize-1; x += 1) { + __m256 ss; + __m128i ssi; + + pix01 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[x+1])); + pix11 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[x+1])); + pix21 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[x+1])); + + ss = _mm256_set1_ps(offset); + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix00, kernel00)); + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix01, kernel01)); + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix10, kernel10)); + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix11, kernel11)); + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix20, kernel20)); + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix21, kernel21)); + + ssi = _mm_cvtps_epi32(_mm_add_ps( + _mm256_extractf128_ps(ss, 0), + _mm256_extractf128_ps(ss, 1) + )); + ssi = _mm_packs_epi32(ssi, ssi); + ssi = _mm_packus_epi16(ssi, ssi); + out[x] = _mm_cvtsi128_si32(ssi); + + pix00 = _mm256_permute2f128_ps(pix00, pix01, 0x21); + pix10 = _mm256_permute2f128_ps(pix10, pix11, 0x21); + pix20 = _mm256_permute2f128_ps(pix20, pix21, 0x21); + } + out[x] = in0[x]; +#else __m128 pix00, pix10, pix20; __m128 pix01, pix11, pix21; __m128 pix02, pix12, pix22; @@ -297,6 +348,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel, out[x] = _mm_cvtsi128_si32(ssi0); } out[x] = in0[x]; +#endif } } memcpy(imOut->image[y], im->image[y], im->linesize);