mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-08-21 04:34:47 +03:00
SIMD Filter. 5x5 implementation
This commit is contained in:
parent
e9e2ed124a
commit
2f4aad4d57
|
@ -146,7 +146,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) {
|
||||||
UINT32* in1 = (UINT32*) im->image[y+1];
|
UINT32* in1 = (UINT32*) im->image[y+1];
|
||||||
UINT32* out = (UINT32*) imOut->image[y];
|
UINT32* out = (UINT32*) imOut->image[y];
|
||||||
|
|
||||||
out[0] = ((UINT32*) in0)[0];
|
out[0] = in0[0];
|
||||||
for (x = 1; x < im->xsize-1; x++) {
|
for (x = 1; x < im->xsize-1; x++) {
|
||||||
__m128 ss = _mm_set1_ps(offset);
|
__m128 ss = _mm_set1_ps(offset);
|
||||||
__m128i ssi;
|
__m128i ssi;
|
||||||
|
@ -160,7 +160,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) {
|
||||||
ssi = _mm_packus_epi16(ssi, ssi);
|
ssi = _mm_packus_epi16(ssi, ssi);
|
||||||
out[x] = _mm_cvtsi128_si32(ssi);
|
out[x] = _mm_cvtsi128_si32(ssi);
|
||||||
}
|
}
|
||||||
memcpy(out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32));
|
out[x] = in0[x];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
memcpy(imOut->image[y], im->image[y], im->linesize);
|
memcpy(imOut->image[y], im->image[y], im->linesize);
|
||||||
|
@ -174,6 +174,18 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
|
||||||
_i2f((UINT8)in0[x + d]) * (kernel)[3] + \
|
_i2f((UINT8)in0[x + d]) * (kernel)[3] + \
|
||||||
_i2f((UINT8)in0[x + d + d]) * (kernel)[4])
|
_i2f((UINT8)in0[x + d + d]) * (kernel)[4])
|
||||||
|
|
||||||
|
#define MM_KERNEL1x5(ss, in0, x, kernel, d) \
|
||||||
|
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[0]), \
|
||||||
|
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x-d-d])))); \
|
||||||
|
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[1]), \
|
||||||
|
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x-d])))); \
|
||||||
|
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[2]), \
|
||||||
|
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x+0])))); \
|
||||||
|
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[3]), \
|
||||||
|
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x+d])))); \
|
||||||
|
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[4]), \
|
||||||
|
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x+d+d]))));
|
||||||
|
|
||||||
int x = 0, y = 0;
|
int x = 0, y = 0;
|
||||||
|
|
||||||
memcpy(imOut->image[0], im->image[0], im->linesize);
|
memcpy(imOut->image[0], im->image[0], im->linesize);
|
||||||
|
@ -204,92 +216,33 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
|
||||||
out[x + 1] = in0[x + 1];
|
out[x + 1] = in0[x + 1];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Add one time for rounding
|
for (y = 2; y < im->ysize-2; y++) {
|
||||||
offset += 0.5;
|
UINT32* in_2 = (UINT32*) im->image[y-2];
|
||||||
for (y = 2; y < im->ysize - 2; y++) {
|
UINT32* in_1 = (UINT32*) im->image[y-1];
|
||||||
UINT8 *in_2 = (UINT8 *)im->image[y - 2];
|
UINT32* in0 = (UINT32*) im->image[y];
|
||||||
UINT8 *in_1 = (UINT8 *)im->image[y - 1];
|
UINT32* in1 = (UINT32*) im->image[y+1];
|
||||||
UINT8 *in0 = (UINT8 *)im->image[y];
|
UINT32* in2 = (UINT32*) im->image[y+2];
|
||||||
UINT8 *in1 = (UINT8 *)im->image[y + 1];
|
UINT32* out = (UINT32*) imOut->image[y];
|
||||||
UINT8 *in2 = (UINT8 *)im->image[y + 2];
|
|
||||||
UINT8 *out = (UINT8 *)imOut->image[y];
|
|
||||||
|
|
||||||
memcpy(out, in0, sizeof(UINT32) * 2);
|
out[0] = in0[0];
|
||||||
if (im->bands == 2) {
|
out[1] = in0[1];
|
||||||
for (x = 2; x < im->xsize - 2; x++) {
|
for (x = 2; x < im->xsize-2; x++) {
|
||||||
float ss0 = offset;
|
__m128 ss = _mm_set1_ps(offset);
|
||||||
float ss3 = offset;
|
__m128i ssi;
|
||||||
UINT32 v;
|
|
||||||
ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
|
MM_KERNEL1x5(ss, in2, x, &kernel[0], 1);
|
||||||
ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4);
|
MM_KERNEL1x5(ss, in1, x, &kernel[5], 1);
|
||||||
ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
|
MM_KERNEL1x5(ss, in0, x, &kernel[10], 1);
|
||||||
ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4);
|
MM_KERNEL1x5(ss, in_1, x, &kernel[15], 1);
|
||||||
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
|
MM_KERNEL1x5(ss, in_2, x, &kernel[20], 1);
|
||||||
ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4);
|
|
||||||
ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
|
ssi = _mm_cvtps_epi32(ss);
|
||||||
ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4);
|
ssi = _mm_packs_epi32(ssi, ssi);
|
||||||
ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
|
ssi = _mm_packus_epi16(ssi, ssi);
|
||||||
ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4);
|
out[x] = _mm_cvtsi128_si32(ssi);
|
||||||
v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
|
|
||||||
memcpy(out + x * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
} else if (im->bands == 3) {
|
|
||||||
for (x = 2; x < im->xsize - 2; x++) {
|
|
||||||
float ss0 = offset;
|
|
||||||
float ss1 = offset;
|
|
||||||
float ss2 = offset;
|
|
||||||
UINT32 v;
|
|
||||||
ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
|
|
||||||
ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4);
|
|
||||||
ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4);
|
|
||||||
ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
|
|
||||||
ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4);
|
|
||||||
ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4);
|
|
||||||
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
|
|
||||||
ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4);
|
|
||||||
ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4);
|
|
||||||
ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
|
|
||||||
ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4);
|
|
||||||
ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4);
|
|
||||||
ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
|
|
||||||
ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4);
|
|
||||||
ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4);
|
|
||||||
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
|
|
||||||
memcpy(out + x * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
} else if (im->bands == 4) {
|
|
||||||
for (x = 2; x < im->xsize - 2; x++) {
|
|
||||||
float ss0 = offset;
|
|
||||||
float ss1 = offset;
|
|
||||||
float ss2 = offset;
|
|
||||||
float ss3 = offset;
|
|
||||||
UINT32 v;
|
|
||||||
ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
|
|
||||||
ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4);
|
|
||||||
ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4);
|
|
||||||
ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4);
|
|
||||||
ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
|
|
||||||
ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4);
|
|
||||||
ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4);
|
|
||||||
ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4);
|
|
||||||
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
|
|
||||||
ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4);
|
|
||||||
ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4);
|
|
||||||
ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4);
|
|
||||||
ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
|
|
||||||
ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4);
|
|
||||||
ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4);
|
|
||||||
ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4);
|
|
||||||
ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
|
|
||||||
ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4);
|
|
||||||
ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4);
|
|
||||||
ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4);
|
|
||||||
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
|
|
||||||
memcpy(out + x * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
memcpy(
|
out[x] = in0[x];
|
||||||
out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32) * 2);
|
out[x+1] = in0[x+1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
memcpy(imOut->image[y], im->image[y], im->linesize);
|
memcpy(imOut->image[y], im->image[y], im->linesize);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user