5x5 implementation

This commit is contained in:
Alexander 2017-08-13 17:45:43 +03:00
parent f79e583365
commit 72f5b73df0

View File

@ -139,7 +139,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
UINT32* in1 = (UINT32*) im->image[y+1];
UINT32* out = (UINT32*) imOut->image[y];
out[0] = ((UINT32*) in0)[0];
out[0] = in0[0];
for (x = 1; x < im->xsize-1; x++) {
__m128 ss = _mm_set1_ps(offset);
__m128i ssi;
@ -153,7 +153,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
ssi = _mm_packus_epi16(ssi, ssi);
out[x] = _mm_cvtsi128_si32(ssi);
}
out[x] = ((UINT32*) in0)[x];
out[x] = in0[x];
}
}
memcpy(imOut->image[y], im->image[y], im->linesize);
@ -171,6 +171,18 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float* kernel,
_i2f((UINT8) in0[x+d]) * (kernel)[3] + \
_i2f((UINT8) in0[x+d+d]) * (kernel)[4])
#define MM_KERNEL1x5(ss, in0, x, kernel, d) \
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[0]), \
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x-d-d])))); \
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[1]), \
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x-d])))); \
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[2]), \
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x+0])))); \
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[3]), \
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x+d])))); \
ss = _mm_add_ps(ss, _mm_mul_ps(_mm_set1_ps((kernel)[4]), \
_mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i *) &(in0)[x+d+d]))));
int x = 0, y = 0;
memcpy(imOut->image[0], im->image[0], im->linesize);
@ -201,89 +213,33 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float* kernel,
out[x+1] = in0[x+1];
}
} else {
// Add one time for rounding
offset += 0.5;
for (y = 2; y < im->ysize-2; y++) {
UINT8* in_2 = (UINT8*) im->image[y-2];
UINT8* in_1 = (UINT8*) im->image[y-1];
UINT8* in0 = (UINT8*) im->image[y];
UINT8* in1 = (UINT8*) im->image[y+1];
UINT8* in2 = (UINT8*) im->image[y+2];
UINT32* in_2 = (UINT32*) im->image[y-2];
UINT32* in_1 = (UINT32*) im->image[y-1];
UINT32* in0 = (UINT32*) im->image[y];
UINT32* in1 = (UINT32*) im->image[y+1];
UINT32* in2 = (UINT32*) im->image[y+2];
UINT32* out = (UINT32*) imOut->image[y];
out[0] = ((UINT32*) in0)[0];
out[1] = ((UINT32*) in0)[1];
if (im->bands == 2) {
out[0] = in0[0];
out[1] = in0[1];
for (x = 2; x < im->xsize-2; x++) {
float ss0 = offset;
float ss3 = offset;
ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4);
ss3 += KERNEL1x5(in2, x*4+3, &kernel[0], 4);
ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4);
ss3 += KERNEL1x5(in1, x*4+3, &kernel[5], 4);
ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4);
ss3 += KERNEL1x5(in0, x*4+3, &kernel[10], 4);
ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4);
ss3 += KERNEL1x5(in_1, x*4+3, &kernel[15], 4);
ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4);
ss3 += KERNEL1x5(in_2, x*4+3, &kernel[20], 4);
out[x] = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
__m128 ss = _mm_set1_ps(offset);
__m128i ssi;
MM_KERNEL1x5(ss, in2, x, &kernel[0], 1);
MM_KERNEL1x5(ss, in1, x, &kernel[5], 1);
MM_KERNEL1x5(ss, in0, x, &kernel[10], 1);
MM_KERNEL1x5(ss, in_1, x, &kernel[15], 1);
MM_KERNEL1x5(ss, in_2, x, &kernel[20], 1);
ssi = _mm_cvtps_epi32(ss);
ssi = _mm_packs_epi32(ssi, ssi);
ssi = _mm_packus_epi16(ssi, ssi);
out[x] = _mm_cvtsi128_si32(ssi);
}
} else if (im->bands == 3) {
for (x = 2; x < im->xsize-2; x++) {
float ss0 = offset;
float ss1 = offset;
float ss2 = offset;
ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4);
ss1 += KERNEL1x5(in2, x*4+1, &kernel[0], 4);
ss2 += KERNEL1x5(in2, x*4+2, &kernel[0], 4);
ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4);
ss1 += KERNEL1x5(in1, x*4+1, &kernel[5], 4);
ss2 += KERNEL1x5(in1, x*4+2, &kernel[5], 4);
ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4);
ss1 += KERNEL1x5(in0, x*4+1, &kernel[10], 4);
ss2 += KERNEL1x5(in0, x*4+2, &kernel[10], 4);
ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4);
ss1 += KERNEL1x5(in_1, x*4+1, &kernel[15], 4);
ss2 += KERNEL1x5(in_1, x*4+2, &kernel[15], 4);
ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4);
ss1 += KERNEL1x5(in_2, x*4+1, &kernel[20], 4);
ss2 += KERNEL1x5(in_2, x*4+2, &kernel[20], 4);
out[x] = MAKE_UINT32(
clip8(ss0), clip8(ss1), clip8(ss2), 0);
}
} else if (im->bands == 4) {
for (x = 2; x < im->xsize-2; x++) {
float ss0 = offset;
float ss1 = offset;
float ss2 = offset;
float ss3 = offset;
ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4);
ss1 += KERNEL1x5(in2, x*4+1, &kernel[0], 4);
ss2 += KERNEL1x5(in2, x*4+2, &kernel[0], 4);
ss3 += KERNEL1x5(in2, x*4+3, &kernel[0], 4);
ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4);
ss1 += KERNEL1x5(in1, x*4+1, &kernel[5], 4);
ss2 += KERNEL1x5(in1, x*4+2, &kernel[5], 4);
ss3 += KERNEL1x5(in1, x*4+3, &kernel[5], 4);
ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4);
ss1 += KERNEL1x5(in0, x*4+1, &kernel[10], 4);
ss2 += KERNEL1x5(in0, x*4+2, &kernel[10], 4);
ss3 += KERNEL1x5(in0, x*4+3, &kernel[10], 4);
ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4);
ss1 += KERNEL1x5(in_1, x*4+1, &kernel[15], 4);
ss2 += KERNEL1x5(in_1, x*4+2, &kernel[15], 4);
ss3 += KERNEL1x5(in_1, x*4+3, &kernel[15], 4);
ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4);
ss1 += KERNEL1x5(in_2, x*4+1, &kernel[20], 4);
ss2 += KERNEL1x5(in_2, x*4+2, &kernel[20], 4);
ss3 += KERNEL1x5(in_2, x*4+3, &kernel[20], 4);
out[x] = MAKE_UINT32(
clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
}
}
out[x] = ((UINT32*) in0)[x];
out[x+1] = ((UINT32*) in0)[x+1];
out[x] = in0[x];
out[x+1] = in0[x+1];
}
}
memcpy(imOut->image[y], im->image[y], im->linesize);