diff --git a/libImaging/FilterSIMD_3x3f_u8.c b/libImaging/FilterSIMD_3x3f_u8.c index 10fb4716a..64f365008 100644 --- a/libImaging/FilterSIMD_3x3f_u8.c +++ b/libImaging/FilterSIMD_3x3f_u8.c @@ -80,9 +80,11 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, __m128 pix00, pix10, pix20, pix30; __m128i ssi0; - MM_KERNEL1x3_LOAD(0, x-1); + pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); + pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); + pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + pix30 = _mm_set_ps(0, in2[x+1], in2[x], in2[x-1]); MM_KERNEL1x3_SUM1(ss0, 0, 0); - pix30 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x-1])); MM_KERNEL1x3_SUM1_2(ss1, 0, 0); ss0 = _mm_hadd_ps(ss0, ss0); @@ -110,7 +112,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, out[0] = in0[0]; x = 1; - for (; x < im->xsize-1; x++) { + for (; x < im->xsize-2; x++) { __m128 ss; __m128 pix00, pix10, pix20; __m128i ssi0; @@ -123,7 +125,24 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, ssi0 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi0); ssi0 = _mm_packus_epi16(ssi0, ssi0); - out[x] = _mm_cvtsi128_si32(ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss; + __m128 pix00, pix10, pix20; + __m128i ssi0; + + pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); + pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); + pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + MM_KERNEL1x3_SUM1(ss, 0, 0); + + ss = _mm_hadd_ps(ss, ss); + ss = _mm_hadd_ps(ss, ss); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); } out[x] = in0[x]; } diff --git a/libImaging/FilterSIMD_3x3i_4u8.c b/libImaging/FilterSIMD_3x3i_4u8.c index 8e970a31d..2a52d45eb 100644 --- a/libImaging/FilterSIMD_3x3i_4u8.c +++ b/libImaging/FilterSIMD_3x3i_4u8.c @@ -54,8 +54,12 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, out[0] = in0[0]; x = 1; - MM_KERNEL_LOAD(0); - for (; x < im->xsize-1-3; x += 4) { + if (im->xsize >= 4) { + MM_KERNEL_LOAD(0); + } + // Last loaded pixel: x+3 + 3 (wide load) + // Last x should be < im->xsize-6 + for (; x < im->xsize-1-5; x += 4) { __m256i ss0, ss2; __m128i ssout; @@ -80,7 +84,25 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, for (; x < im->xsize-1; x++) { __m256i ss = _mm256_set1_epi32(offset); - MM_KERNEL_LOAD(x-1); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in1[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in1[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in0[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in0[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in_1[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in_1[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + MM_KERNEL_SUM(ss, 0, 0x00); MM_KERNEL_SUM(ss, 1, 0x55); @@ -131,8 +153,12 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, out[0] = in0[0]; x = 1; - MM_KERNEL_LOAD(0); - for (; x < im->xsize-1-3; x += 4) { + if (im->xsize >= 4) { + MM_KERNEL_LOAD(0); + } + // Last loaded pixel: x+3 + 3 (wide load) + // Last x should be < im->xsize-6 + for (; x < im->xsize-1-5; x += 4) { __m128i ss0 = _mm_set1_epi32(offset); __m128i ss1 = _mm_set1_epi32(offset); __m128i ss2 = _mm_set1_epi32(offset); @@ -164,7 +190,19 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, for (; x < im->xsize-1; x++) { __m128i ss = _mm_set1_epi32(offset); - MM_KERNEL_LOAD(x-1); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in1[x-1]), shuffle); + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in1[x+1]), shuffle); + pix01 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in0[x-1]), shuffle); + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in0[x+1]), shuffle); + pix11 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in_1[x-1]), shuffle); + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in_1[x+1]), shuffle); + pix21 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + MM_KERNEL_SUM(ss, 0, 0x00); MM_KERNEL_SUM(ss, 1, 0x55); diff --git a/libImaging/FilterSIMD_5x5i_4u8.c b/libImaging/FilterSIMD_5x5i_4u8.c index 5c6dced03..d91d472d0 100644 --- a/libImaging/FilterSIMD_5x5i_4u8.c +++ b/libImaging/FilterSIMD_5x5i_4u8.c @@ -102,7 +102,16 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, __m256i ss0; MM_KERNEL_LOAD(0, x-2); - MM_KERNEL_LOAD(1, x+2); + pix01 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle)); + pix01 = _mm256_inserti128_si256(pix01, _mm256_castsi256_si128(pix01), 1); + pix11 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle)); + pix11 = _mm256_inserti128_si256(pix11, _mm256_castsi256_si128(pix11), 1); + pix21 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle)); + pix21 = _mm256_inserti128_si256(pix21, _mm256_castsi256_si128(pix21), 1); + pix31 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle)); + pix31 = _mm256_inserti128_si256(pix31, _mm256_castsi256_si128(pix31), 1); + pix41 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle)); + pix41 = _mm256_inserti128_si256(pix41, _mm256_castsi256_si128(pix41), 1); ss0 = _mm256_set1_epi32(offset); MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00); @@ -213,7 +222,11 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, __m128i ss0; MM_KERNEL_LOAD(0, x-2); - MM_KERNEL_LOAD(1, x+2); + pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle); + pix11 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle); + pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle); + pix31 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle); + pix41 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle); ss0 = _mm_set1_epi32(offset); MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);