mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-08-21 04:34:47 +03:00
SIMD Filter. fix memory access for:
3x3f_u8 3x3i_4u8 5x5i_4u8
This commit is contained in:
parent
1bd3fddf44
commit
416e580a77
|
@ -80,9 +80,11 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
|
||||||
__m128 pix00, pix10, pix20, pix30;
|
__m128 pix00, pix10, pix20, pix30;
|
||||||
__m128i ssi0;
|
__m128i ssi0;
|
||||||
|
|
||||||
MM_KERNEL1x3_LOAD(0, x-1);
|
pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]);
|
||||||
|
pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]);
|
||||||
|
pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]);
|
||||||
|
pix30 = _mm_set_ps(0, in2[x+1], in2[x], in2[x-1]);
|
||||||
MM_KERNEL1x3_SUM1(ss0, 0, 0);
|
MM_KERNEL1x3_SUM1(ss0, 0, 0);
|
||||||
pix30 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x-1]));
|
|
||||||
MM_KERNEL1x3_SUM1_2(ss1, 0, 0);
|
MM_KERNEL1x3_SUM1_2(ss1, 0, 0);
|
||||||
|
|
||||||
ss0 = _mm_hadd_ps(ss0, ss0);
|
ss0 = _mm_hadd_ps(ss0, ss0);
|
||||||
|
@ -110,7 +112,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
|
||||||
|
|
||||||
out[0] = in0[0];
|
out[0] = in0[0];
|
||||||
x = 1;
|
x = 1;
|
||||||
for (; x < im->xsize-1; x++) {
|
for (; x < im->xsize-2; x++) {
|
||||||
__m128 ss;
|
__m128 ss;
|
||||||
__m128 pix00, pix10, pix20;
|
__m128 pix00, pix10, pix20;
|
||||||
__m128i ssi0;
|
__m128i ssi0;
|
||||||
|
@ -125,6 +127,23 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
|
||||||
ssi0 = _mm_packus_epi16(ssi0, ssi0);
|
ssi0 = _mm_packus_epi16(ssi0, ssi0);
|
||||||
out[x] = _mm_cvtsi128_si32(ssi0);
|
out[x] = _mm_cvtsi128_si32(ssi0);
|
||||||
}
|
}
|
||||||
|
for (; x < im->xsize-1; x++) {
|
||||||
|
__m128 ss;
|
||||||
|
__m128 pix00, pix10, pix20;
|
||||||
|
__m128i ssi0;
|
||||||
|
|
||||||
|
pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]);
|
||||||
|
pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]);
|
||||||
|
pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]);
|
||||||
|
MM_KERNEL1x3_SUM1(ss, 0, 0);
|
||||||
|
|
||||||
|
ss = _mm_hadd_ps(ss, ss);
|
||||||
|
ss = _mm_hadd_ps(ss, ss);
|
||||||
|
ssi0 = _mm_cvtps_epi32(ss);
|
||||||
|
ssi0 = _mm_packs_epi32(ssi0, ssi0);
|
||||||
|
ssi0 = _mm_packus_epi16(ssi0, ssi0);
|
||||||
|
out[x] = _mm_cvtsi128_si32(ssi0);
|
||||||
|
}
|
||||||
out[x] = in0[x];
|
out[x] = in0[x];
|
||||||
}
|
}
|
||||||
memcpy(imOut->image8[y], im->image8[y], im->linesize);
|
memcpy(imOut->image8[y], im->image8[y], im->linesize);
|
||||||
|
|
|
@ -54,8 +54,12 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
|
|
||||||
out[0] = in0[0];
|
out[0] = in0[0];
|
||||||
x = 1;
|
x = 1;
|
||||||
MM_KERNEL_LOAD(0);
|
if (im->xsize >= 4) {
|
||||||
for (; x < im->xsize-1-3; x += 4) {
|
MM_KERNEL_LOAD(0);
|
||||||
|
}
|
||||||
|
// Last loaded pixel: x+3 + 3 (wide load)
|
||||||
|
// Last x should be < im->xsize-6
|
||||||
|
for (; x < im->xsize-1-5; x += 4) {
|
||||||
__m256i ss0, ss2;
|
__m256i ss0, ss2;
|
||||||
__m128i ssout;
|
__m128i ssout;
|
||||||
|
|
||||||
|
@ -80,7 +84,25 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
for (; x < im->xsize-1; x++) {
|
for (; x < im->xsize-1; x++) {
|
||||||
__m256i ss = _mm256_set1_epi32(offset);
|
__m256i ss = _mm256_set1_epi32(offset);
|
||||||
|
|
||||||
MM_KERNEL_LOAD(x-1);
|
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128(
|
||||||
|
_mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in1[x+1]), 8),
|
||||||
|
_mm_loadl_epi64((__m128i*) &in1[x-1])), shuffle));
|
||||||
|
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1);
|
||||||
|
pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128(
|
||||||
|
_mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in0[x+1]), 8),
|
||||||
|
_mm_loadl_epi64((__m128i*) &in0[x-1])), shuffle));
|
||||||
|
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1);
|
||||||
|
pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128(
|
||||||
|
_mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in_1[x+1]), 8),
|
||||||
|
_mm_loadl_epi64((__m128i*) &in_1[x-1])), shuffle));
|
||||||
|
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1);
|
||||||
|
pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
|
||||||
MM_KERNEL_SUM(ss, 0, 0x00);
|
MM_KERNEL_SUM(ss, 0, 0x00);
|
||||||
MM_KERNEL_SUM(ss, 1, 0x55);
|
MM_KERNEL_SUM(ss, 1, 0x55);
|
||||||
|
|
||||||
|
@ -131,8 +153,12 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
|
|
||||||
out[0] = in0[0];
|
out[0] = in0[0];
|
||||||
x = 1;
|
x = 1;
|
||||||
MM_KERNEL_LOAD(0);
|
if (im->xsize >= 4) {
|
||||||
for (; x < im->xsize-1-3; x += 4) {
|
MM_KERNEL_LOAD(0);
|
||||||
|
}
|
||||||
|
// Last loaded pixel: x+3 + 3 (wide load)
|
||||||
|
// Last x should be < im->xsize-6
|
||||||
|
for (; x < im->xsize-1-5; x += 4) {
|
||||||
__m128i ss0 = _mm_set1_epi32(offset);
|
__m128i ss0 = _mm_set1_epi32(offset);
|
||||||
__m128i ss1 = _mm_set1_epi32(offset);
|
__m128i ss1 = _mm_set1_epi32(offset);
|
||||||
__m128i ss2 = _mm_set1_epi32(offset);
|
__m128i ss2 = _mm_set1_epi32(offset);
|
||||||
|
@ -164,7 +190,19 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
for (; x < im->xsize-1; x++) {
|
for (; x < im->xsize-1; x++) {
|
||||||
__m128i ss = _mm_set1_epi32(offset);
|
__m128i ss = _mm_set1_epi32(offset);
|
||||||
|
|
||||||
MM_KERNEL_LOAD(x-1);
|
source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in1[x-1]), shuffle);
|
||||||
|
pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in1[x+1]), shuffle);
|
||||||
|
pix01 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in0[x-1]), shuffle);
|
||||||
|
pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in0[x+1]), shuffle);
|
||||||
|
pix11 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in_1[x-1]), shuffle);
|
||||||
|
pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in_1[x+1]), shuffle);
|
||||||
|
pix21 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
|
||||||
MM_KERNEL_SUM(ss, 0, 0x00);
|
MM_KERNEL_SUM(ss, 0, 0x00);
|
||||||
MM_KERNEL_SUM(ss, 1, 0x55);
|
MM_KERNEL_SUM(ss, 1, 0x55);
|
||||||
|
|
||||||
|
|
|
@ -102,7 +102,16 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
__m256i ss0;
|
__m256i ss0;
|
||||||
|
|
||||||
MM_KERNEL_LOAD(0, x-2);
|
MM_KERNEL_LOAD(0, x-2);
|
||||||
MM_KERNEL_LOAD(1, x+2);
|
pix01 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle));
|
||||||
|
pix01 = _mm256_inserti128_si256(pix01, _mm256_castsi256_si128(pix01), 1);
|
||||||
|
pix11 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle));
|
||||||
|
pix11 = _mm256_inserti128_si256(pix11, _mm256_castsi256_si128(pix11), 1);
|
||||||
|
pix21 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle));
|
||||||
|
pix21 = _mm256_inserti128_si256(pix21, _mm256_castsi256_si128(pix21), 1);
|
||||||
|
pix31 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle));
|
||||||
|
pix31 = _mm256_inserti128_si256(pix31, _mm256_castsi256_si128(pix31), 1);
|
||||||
|
pix41 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle));
|
||||||
|
pix41 = _mm256_inserti128_si256(pix41, _mm256_castsi256_si128(pix41), 1);
|
||||||
|
|
||||||
ss0 = _mm256_set1_epi32(offset);
|
ss0 = _mm256_set1_epi32(offset);
|
||||||
MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00);
|
MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00);
|
||||||
|
@ -213,7 +222,11 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
__m128i ss0;
|
__m128i ss0;
|
||||||
|
|
||||||
MM_KERNEL_LOAD(0, x-2);
|
MM_KERNEL_LOAD(0, x-2);
|
||||||
MM_KERNEL_LOAD(1, x+2);
|
pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle);
|
||||||
|
pix11 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle);
|
||||||
|
pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle);
|
||||||
|
pix31 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle);
|
||||||
|
pix41 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle);
|
||||||
|
|
||||||
ss0 = _mm_set1_epi32(offset);
|
ss0 = _mm_set1_epi32(offset);
|
||||||
MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
|
MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user