SIMD Filter. 3x3 implementation

SIMD Filter. 5x5 implementation

SIMD Filter. fast 3x3 filter

SIMD Filter. a bit faster 5x5 filter

SIMD Filter. improve locality in 5x5 filter

SIMD Filter. rearrange 3x3 filter to match 5x5

SIMD Filter. use macros

SIMD Filter. use macros in 3x3

SIMD Filter. 3x3 SSE4 singleband

SIMD Filter. faster 3x3 singleband SSE4

SIMD Filter. reuse loaded values

SIMD Filter. 3x3 SSE4 singleband: 2 lines

SIMD Filter. First AVX try

SIMD Filter. unroll AVX 2 times

SIMD Filter. Macros for AVX

SIMD Filter. unroll AVX (with no profit)

SIMD Filter. consider last pixel in AVX

SIMD Filter. 5x5 single channel SSE4 (tests failed)

SIMD Filter. fix offset

SIMD Filter. move ImagingFilterxxx functions to separate files

SIMD Filter. 3x3i

SIMD Filter. better macros

SIMD Filter. better loading

SIMD Filter. Rearrange instructions for speedup

SIMD Filter. reduce number of registers

SIMD Filter. rearrange operations

SIMD Filter. avx2 version

SIMD Filter. finish 3x3i_4u8

SIMD Filter. 5x5i_4u8 SSE4

SIMD Filter. advanced 5x5i_4u8 SSE4

SIMD Filter. 5x5i_4u8 AVX2

SIMD Filter. fix memory access for:

3x3f_u8
3x3i_4u8
5x5i_4u8

SIMD Filter. move files

SIMD Filter. Correct offset for 3x3f_u8

# Conflicts:
#	src/libImaging/Filter.c
Alexander authored on 2017-08-13 17:16:16 +03:00; committed by Aleksandr Karpinskii
parent cb55ee64b5
commit 8e3a81a53c
7 changed files with 1151 additions and 286 deletions

src/libImaging/Filter.c

@@ -26,16 +26,21 @@
#include "Imaging.h"
static inline UINT8
clip8(float in) {
if (in <= 0.0) {
return 0;
}
if (in >= 255.0) {
return 255;
}
return (UINT8)in;
}
/* 5 bits are enough to account for all kernel coefficients (1 << 5 > 25).
   8 is the number of bits in the result.
   Any coefficient delta smaller than this precision will have no effect. */
#define PRECISION_BITS (8 + 5)
/* 16 is the number of bits required for kernel storage (INT16).
   1 bit is reserved for the sign.
   The largest possible kernel coefficient is therefore: */
#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS))
#include "FilterSIMD_3x3f_u8.c"
#include "FilterSIMD_3x3f_4u8.c"
#include "FilterSIMD_3x3i_4u8.c"
#include "FilterSIMD_5x5f_u8.c"
#include "FilterSIMD_5x5f_4u8.c"
#include "FilterSIMD_5x5i_4u8.c"
static inline INT32
clip32(float in) {
@@ -105,283 +110,14 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin) {
return imOut;
}
void
ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) {
#define KERNEL1x3(in0, x, kernel, d) \
(_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \
_i2f(in0[x + d]) * (kernel)[2])
int x = 0, y = 0;
memcpy(imOut->image[0], im->image[0], im->linesize);
if (im->bands == 1) {
// Add one time for rounding
offset += 0.5;
if (im->type == IMAGING_TYPE_INT32) {
for (y = 1; y < im->ysize - 1; y++) {
INT32 *in_1 = (INT32 *)im->image[y - 1];
INT32 *in0 = (INT32 *)im->image[y];
INT32 *in1 = (INT32 *)im->image[y + 1];
INT32 *out = (INT32 *)imOut->image[y];
out[0] = in0[0];
for (x = 1; x < im->xsize - 1; x++) {
float ss = offset;
ss += KERNEL1x3(in1, x, &kernel[0], 1);
ss += KERNEL1x3(in0, x, &kernel[3], 1);
ss += KERNEL1x3(in_1, x, &kernel[6], 1);
out[x] = clip32(ss);
}
out[x] = in0[x];
}
} else {
for (y = 1; y < im->ysize - 1; y++) {
UINT8 *in_1 = (UINT8 *)im->image[y - 1];
UINT8 *in0 = (UINT8 *)im->image[y];
UINT8 *in1 = (UINT8 *)im->image[y + 1];
UINT8 *out = (UINT8 *)imOut->image[y];
out[0] = in0[0];
for (x = 1; x < im->xsize - 1; x++) {
float ss = offset;
ss += KERNEL1x3(in1, x, &kernel[0], 1);
ss += KERNEL1x3(in0, x, &kernel[3], 1);
ss += KERNEL1x3(in_1, x, &kernel[6], 1);
out[x] = clip8(ss);
}
out[x] = in0[x];
}
}
} else {
// Add one time for rounding
offset += 0.5;
for (y = 1; y < im->ysize - 1; y++) {
UINT8 *in_1 = (UINT8 *)im->image[y - 1];
UINT8 *in0 = (UINT8 *)im->image[y];
UINT8 *in1 = (UINT8 *)im->image[y + 1];
UINT8 *out = (UINT8 *)imOut->image[y];
memcpy(out, in0, sizeof(UINT32));
if (im->bands == 2) {
for (x = 1; x < im->xsize - 1; x++) {
float ss0 = offset;
float ss3 = offset;
UINT32 v;
ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4);
ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4);
ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4);
ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4);
ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4);
ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4);
v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
memcpy(out + x * sizeof(v), &v, sizeof(v));
}
} else if (im->bands == 3) {
for (x = 1; x < im->xsize - 1; x++) {
float ss0 = offset;
float ss1 = offset;
float ss2 = offset;
UINT32 v;
ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4);
ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4);
ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4);
ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4);
ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4);
ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4);
ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4);
ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4);
ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4);
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
memcpy(out + x * sizeof(v), &v, sizeof(v));
}
} else if (im->bands == 4) {
for (x = 1; x < im->xsize - 1; x++) {
float ss0 = offset;
float ss1 = offset;
float ss2 = offset;
float ss3 = offset;
UINT32 v;
ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4);
ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4);
ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4);
ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4);
ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4);
ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4);
ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4);
ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4);
ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4);
ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4);
ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4);
ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4);
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
memcpy(out + x * sizeof(v), &v, sizeof(v));
}
}
memcpy(out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32));
}
}
memcpy(imOut->image[y], im->image[y], im->linesize);
}
void
ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
#define KERNEL1x5(in0, x, kernel, d) \
(_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \
_i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] + \
_i2f(in0[x + d + d]) * (kernel)[4])
int x = 0, y = 0;
memcpy(imOut->image[0], im->image[0], im->linesize);
memcpy(imOut->image[1], im->image[1], im->linesize);
if (im->bands == 1) {
// Add one time for rounding
offset += 0.5;
if (im->type == IMAGING_TYPE_INT32) {
for (y = 2; y < im->ysize - 2; y++) {
INT32 *in_2 = (INT32 *)im->image[y - 2];
INT32 *in_1 = (INT32 *)im->image[y - 1];
INT32 *in0 = (INT32 *)im->image[y];
INT32 *in1 = (INT32 *)im->image[y + 1];
INT32 *in2 = (INT32 *)im->image[y + 2];
INT32 *out = (INT32 *)imOut->image[y];
out[0] = in0[0];
out[1] = in0[1];
for (x = 2; x < im->xsize - 2; x++) {
float ss = offset;
ss += KERNEL1x5(in2, x, &kernel[0], 1);
ss += KERNEL1x5(in1, x, &kernel[5], 1);
ss += KERNEL1x5(in0, x, &kernel[10], 1);
ss += KERNEL1x5(in_1, x, &kernel[15], 1);
ss += KERNEL1x5(in_2, x, &kernel[20], 1);
out[x] = clip32(ss);
}
out[x + 0] = in0[x + 0];
out[x + 1] = in0[x + 1];
}
} else {
for (y = 2; y < im->ysize - 2; y++) {
UINT8 *in_2 = (UINT8 *)im->image[y - 2];
UINT8 *in_1 = (UINT8 *)im->image[y - 1];
UINT8 *in0 = (UINT8 *)im->image[y];
UINT8 *in1 = (UINT8 *)im->image[y + 1];
UINT8 *in2 = (UINT8 *)im->image[y + 2];
UINT8 *out = (UINT8 *)imOut->image[y];
out[0] = in0[0];
out[1] = in0[1];
for (x = 2; x < im->xsize - 2; x++) {
float ss = offset;
ss += KERNEL1x5(in2, x, &kernel[0], 1);
ss += KERNEL1x5(in1, x, &kernel[5], 1);
ss += KERNEL1x5(in0, x, &kernel[10], 1);
ss += KERNEL1x5(in_1, x, &kernel[15], 1);
ss += KERNEL1x5(in_2, x, &kernel[20], 1);
out[x] = clip8(ss);
}
out[x + 0] = in0[x + 0];
out[x + 1] = in0[x + 1];
}
}
} else {
// Add one time for rounding
offset += 0.5;
for (y = 2; y < im->ysize - 2; y++) {
UINT8 *in_2 = (UINT8 *)im->image[y - 2];
UINT8 *in_1 = (UINT8 *)im->image[y - 1];
UINT8 *in0 = (UINT8 *)im->image[y];
UINT8 *in1 = (UINT8 *)im->image[y + 1];
UINT8 *in2 = (UINT8 *)im->image[y + 2];
UINT8 *out = (UINT8 *)imOut->image[y];
memcpy(out, in0, sizeof(UINT32) * 2);
if (im->bands == 2) {
for (x = 2; x < im->xsize - 2; x++) {
float ss0 = offset;
float ss3 = offset;
UINT32 v;
ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4);
ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4);
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4);
ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4);
ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4);
v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
memcpy(out + x * sizeof(v), &v, sizeof(v));
}
} else if (im->bands == 3) {
for (x = 2; x < im->xsize - 2; x++) {
float ss0 = offset;
float ss1 = offset;
float ss2 = offset;
UINT32 v;
ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4);
ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4);
ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4);
ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4);
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4);
ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4);
ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4);
ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4);
ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4);
ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4);
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
memcpy(out + x * sizeof(v), &v, sizeof(v));
}
} else if (im->bands == 4) {
for (x = 2; x < im->xsize - 2; x++) {
float ss0 = offset;
float ss1 = offset;
float ss2 = offset;
float ss3 = offset;
UINT32 v;
ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4);
ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4);
ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4);
ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4);
ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4);
ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4);
ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4);
ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4);
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4);
ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4);
ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4);
ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4);
ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4);
ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4);
ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4);
ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4);
ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4);
ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4);
ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4);
ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4);
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
memcpy(out + x * sizeof(v), &v, sizeof(v));
}
}
memcpy(
out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32) * 2
);
}
}
memcpy(imOut->image[y], im->image[y], im->linesize);
memcpy(imOut->image[y + 1], im->image[y + 1], im->linesize);
}
Imaging
ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) {
Imaging imOut;
ImagingSectionCookie cookie;
int i, fast_path = 1;
FLOAT32 tmp;
INT16 norm_kernel[25];
INT32 norm_offset;
if (im->type != IMAGING_TYPE_UINT8 && im->type != IMAGING_TYPE_INT32) {
return (Imaging)ImagingError_ModeError();
@@ -400,13 +136,48 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) {
return NULL;
}
for (i = 0; i < xsize * ysize; i++) {
tmp = kernel[i] * (1 << PRECISION_BITS);
tmp += (tmp >= 0) ? 0.5 : -0.5;
if (tmp >= 32768 || tmp <= -32769) {
fast_path = 0;
break;
}
norm_kernel[i] = tmp;
}
tmp = offset * (1 << PRECISION_BITS);
tmp += (tmp >= 0) ? 0.5 : -0.5;
if (tmp >= 1 << 30) {
norm_offset = 1 << 30;
} else if (tmp <= -1 << 30) {
norm_offset = -1 << 30;
} else {
norm_offset = tmp;
}
ImagingSectionEnter(&cookie);
if (xsize == 3) {
/* 3x3 kernel. */
ImagingFilter3x3(imOut, im, kernel, offset);
if (im->image8) {
ImagingFilter3x3f_u8(imOut, im, kernel, offset);
} else {
if (fast_path) {
ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset);
} else {
ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
}
}
} else {
/* 5x5 kernel. */
ImagingFilter5x5(imOut, im, kernel, offset);
if (im->image8) {
ImagingFilter5x5f_u8(imOut, im, kernel, offset);
} else {
if (fast_path) {
ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset);
} else {
ImagingFilter5x5f_4u8(imOut, im, kernel, offset);
}
}
}
ImagingSectionLeave(&cookie);
return imOut;
}

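For reference, the integer fast path selected above rests on the PRECISION_BITS fixed-point scheme: float coefficients are scaled by 1 << PRECISION_BITS and rounded to INT16, pixels are accumulated with integer multiply-adds, and the sum is shifted back down. A minimal standalone sketch of the round trip (not part of the commit; names are ad hoc):

#include <stdint.h>
#include <stdio.h>

#define PRECISION_BITS (8 + 5)

int main(void)
{
    /* 3x3 box blur: every coefficient is 1/9. */
    float kernel[9] = {1/9.0f, 1/9.0f, 1/9.0f, 1/9.0f, 1/9.0f,
                       1/9.0f, 1/9.0f, 1/9.0f, 1/9.0f};
    uint8_t src[9] = {10, 20, 30, 40, 50, 60, 70, 80, 90};
    int16_t norm_kernel[9];
    int32_t ss = 1 << (PRECISION_BITS - 1);   /* rounding term, as in 3x3i_4u8 */
    int i;
    for (i = 0; i < 9; i++) {                 /* same conversion as ImagingFilter */
        float tmp = kernel[i] * (1 << PRECISION_BITS);
        norm_kernel[i] = (int16_t)(tmp + (tmp >= 0 ? 0.5f : -0.5f));
    }
    for (i = 0; i < 9; i++) {
        ss += src[i] * norm_kernel[i];        /* pure integer accumulation */
    }
    printf("%d\n", ss >> PRECISION_BITS);     /* prints 50, the box average */
    return 0;
}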
src/libImaging/FilterSIMD_3x3f_4u8.c

@@ -0,0 +1,237 @@
void
ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel,
float offset)
{
#define MM_KERNEL1x3_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x]));
#define MM_KERNEL1x3_SUM(row, kindex) \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[3 + kindex]))); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[6 + kindex])));
int x = 0, y = 0;
memcpy(imOut->image32[0], im->image32[0], im->linesize);
for (y = 1; y < im->ysize-1; y++) {
INT32* in_1 = im->image32[y-1];
INT32* in0 = im->image32[y];
INT32* in1 = im->image32[y+1];
INT32* out = imOut->image32[y];
#if defined(__AVX2__)
#define MM256_LOAD(row, x) \
pix0##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in1[x])); \
pix1##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in0[x])); \
pix2##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in_1[x]));
#define MM256_SUM(pixrow, kernelrow) \
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix1##pixrow, kernel1##kernelrow)); \
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix2##pixrow, kernel2##kernelrow));
#define MM256_PERMUTE(row, from0, from1, control) \
pix0##row = _mm256_permute2f128_ps(pix0##from0, pix0##from1, control); \
pix1##row = _mm256_permute2f128_ps(pix1##from0, pix1##from1, control); \
pix2##row = _mm256_permute2f128_ps(pix2##from0, pix2##from1, control);
#define MM256_OUT(x) \
_mm_cvtps_epi32(_mm_add_ps( \
_mm256_extractf128_ps(ss, 0), \
_mm256_extractf128_ps(ss, 1) \
))
__m256 kernel00 = _mm256_insertf128_ps(
_mm256_set1_ps(kernel[0+0]),
_mm_set1_ps(kernel[0+1]), 1);
__m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2]));
__m256 kernel10 = _mm256_insertf128_ps(
_mm256_set1_ps(kernel[3+0]),
_mm_set1_ps(kernel[3+1]), 1);
__m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2]));
__m256 kernel20 = _mm256_insertf128_ps(
_mm256_set1_ps(kernel[6+0]),
_mm_set1_ps(kernel[6+1]), 1);
__m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2]));
__m256 pix00, pix10, pix20;
__m256 pix01, pix11, pix21;
__m256 pix02, pix12, pix22;
out[0] = in0[0];
x = 1;
MM256_LOAD(0, 0);
for (; x < im->xsize-1-7; x += 8) {
__m256 ss;
__m128i ssi0, ssi1, ssi2, ssi3;
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(0, 0);
MM256_LOAD(1, x+1);
MM256_SUM(1, 1);
ssi0 = MM256_OUT(x);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(1, 0);
MM256_LOAD(2, x+3);
MM256_SUM(2, 1);
ssi2 = MM256_OUT(x+2);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_PERMUTE(0, 0, 1, 0x21);
MM256_SUM(0, 0);
MM256_PERMUTE(1, 1, 2, 0x21);
MM256_SUM(1, 1);
ssi1 = MM256_OUT(x+1);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(1, 0);
MM256_PERMUTE(0, 2, 2, 0x21);
MM256_SUM(0, 1);
ssi3 = MM256_OUT(x+3);
ssi0 = _mm_packs_epi32(ssi0, ssi1);
ssi2 = _mm_packs_epi32(ssi2, ssi3);
ssi0 = _mm_packus_epi16(ssi0, ssi2);
_mm_storeu_si128((__m128i*) &out[x], ssi0);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(2, 0);
MM256_LOAD(1, x+5);
MM256_SUM(1, 1);
ssi0 = MM256_OUT(x+4);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(1, 0);
MM256_LOAD(0, x+7);
MM256_SUM(0, 1);
ssi2 = MM256_OUT(x+6);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_PERMUTE(2, 2, 1, 0x21);
MM256_SUM(2, 0);
MM256_PERMUTE(1, 1, 0, 0x21);
MM256_SUM(1, 1);
ssi1 = MM256_OUT(x+5);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(1, 0);
MM256_PERMUTE(2, 0, 0, 0x21);
MM256_SUM(2, 1);
ssi3 = MM256_OUT(x+7);
ssi0 = _mm_packs_epi32(ssi0, ssi1);
ssi2 = _mm_packs_epi32(ssi2, ssi3);
ssi0 = _mm_packus_epi16(ssi0, ssi2);
_mm_storeu_si128((__m128i*) &out[x+4], ssi0);
}
for (; x < im->xsize-1-1; x += 2) {
__m256 ss;
__m128i ssi0, ssi1;
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_SUM(0, 0);
MM256_LOAD(1, x+1);
MM256_SUM(1, 1);
ssi0 = MM256_OUT(x);
ss = _mm256_castps128_ps256(_mm_set1_ps(offset));
MM256_PERMUTE(0, 0, 1, 0x21);
MM256_SUM(0, 0);
MM256_PERMUTE(1, 0, 1, 0xf3);
MM256_SUM(1, 1);
ssi1 = MM256_OUT(x+1);
ssi0 = _mm_packs_epi32(ssi0, ssi1);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
_mm_storel_epi64((__m128i*) &out[x], ssi0);
MM256_PERMUTE(0, 0, 1, 0x21);
}
for (; x < im->xsize-1; x++) {
__m128 pix00, pix10, pix20;
__m128 ss = _mm_set1_ps(offset);
__m128i ssi0;
MM_KERNEL1x3_LOAD(0, x-1);
MM_KERNEL1x3_SUM(0, 0);
MM_KERNEL1x3_LOAD(0, x+0);
MM_KERNEL1x3_SUM(0, 1);
MM_KERNEL1x3_LOAD(0, x+1);
MM_KERNEL1x3_SUM(0, 2);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
}
out[x] = in0[x];
#undef MM256_LOAD
#undef MM256_SUM
#undef MM256_PERMUTE
#undef MM256_OUT
#else
__m128 pix00, pix10, pix20;
__m128 pix01, pix11, pix21;
__m128 pix02, pix12, pix22;
MM_KERNEL1x3_LOAD(0, 0);
MM_KERNEL1x3_LOAD(1, 1);
out[0] = in0[0];
x = 1;
for (; x < im->xsize-1-2; x += 3) {
__m128 ss;
__m128i ssi0, ssi1, ssi2;
ss = _mm_set1_ps(offset);
MM_KERNEL1x3_SUM(0, 0);
MM_KERNEL1x3_SUM(1, 1);
MM_KERNEL1x3_LOAD(2, x+1);
MM_KERNEL1x3_SUM(2, 2);
ssi0 = _mm_cvtps_epi32(ss);
ss = _mm_set1_ps(offset);
MM_KERNEL1x3_SUM(1, 0);
MM_KERNEL1x3_SUM(2, 1);
MM_KERNEL1x3_LOAD(0, x+2);
MM_KERNEL1x3_SUM(0, 2);
ssi1 = _mm_cvtps_epi32(ss);
ss = _mm_set1_ps(offset);
MM_KERNEL1x3_SUM(2, 0);
MM_KERNEL1x3_SUM(0, 1);
MM_KERNEL1x3_LOAD(1, x+3);
MM_KERNEL1x3_SUM(1, 2);
ssi2 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi1);
ssi1 = _mm_packs_epi32(ssi2, ssi2);
ssi0 = _mm_packus_epi16(ssi0, ssi1);
_mm_storeu_si128((__m128i*) &out[x], ssi0);
}
for (; x < im->xsize-1; x++) {
__m128 ss = _mm_set1_ps(offset);
__m128i ssi0;
MM_KERNEL1x3_LOAD(0, x-1);
MM_KERNEL1x3_SUM(0, 0);
MM_KERNEL1x3_LOAD(0, x+0);
MM_KERNEL1x3_SUM(0, 1);
MM_KERNEL1x3_LOAD(0, x+1);
MM_KERNEL1x3_SUM(0, 2);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
}
out[x] = in0[x];
#endif
}
memcpy(imOut->image32[y], im->image32[y], im->linesize);
#undef MM_KERNEL1x3_LOAD
#undef MM_KERNEL1x3_SUM
}
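
A scalar model of what each tail-loop iteration above computes, written out for one output pixel, may help when reading the macros; this is an illustrative sketch only, not code from the commit. rows[0] is scanline y+1 and rows[2] is y-1, matching the kernel layout of MM_KERNEL1x3_SUM, and the +0.5f approximates the round-to-nearest of _mm_cvtps_epi32:

#include <stdint.h>

static void
filter3x3_4u8_one(const uint8_t *rows[3], int x, const float kernel[9],
                  float offset, uint8_t out[4])
{
    int c, ky, kx;
    for (c = 0; c < 4; c++) {                 /* four interleaved channels */
        float ss = offset;
        for (ky = 0; ky < 3; ky++) {
            for (kx = 0; kx < 3; kx++) {
                ss += rows[ky][(x + kx - 1) * 4 + c] * kernel[ky * 3 + kx];
            }
        }
        ss += 0.5f;
        out[c] = ss <= 0 ? 0 : (ss >= 255 ? 255 : (uint8_t)ss);
    }
}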

src/libImaging/FilterSIMD_3x3f_u8.c

@@ -0,0 +1,159 @@
void
ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
float offset)
{
#define MM_KERNEL1x3_SUM1(ss, row, kernel) \
ss = _mm_mul_ps(pix0##row, kernel0##kernel); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##kernel)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##kernel));
#define MM_KERNEL1x3_SUM1_2(ss, row, kernel) \
ss = _mm_mul_ps(pix3##row, kernel0##kernel); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel1##kernel)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel));
#define MM_KERNEL1x3_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x]));
int x, y;
__m128 kernel00 = _mm_set_ps(0, kernel[2], kernel[1], kernel[0]);
__m128 kernel10 = _mm_set_ps(0, kernel[5], kernel[4], kernel[3]);
__m128 kernel20 = _mm_set_ps(0, kernel[8], kernel[7], kernel[6]);
__m128 kernel01 = _mm_set_ps(kernel[2], kernel[1], kernel[0], 0);
__m128 kernel11 = _mm_set_ps(kernel[5], kernel[4], kernel[3], 0);
__m128 kernel21 = _mm_set_ps(kernel[8], kernel[7], kernel[6], 0);
__m128 mm_offset = _mm_set1_ps(offset);
memcpy(imOut->image8[0], im->image8[0], im->linesize);
y = 1;
for (; y < im->ysize-1-1; y += 2) {
UINT8* in_1 = im->image8[y-1];
UINT8* in0 = im->image8[y];
UINT8* in1 = im->image8[y+1];
UINT8* in2 = im->image8[y+2];
UINT8* out0 = imOut->image8[y];
UINT8* out1 = imOut->image8[y+1];
out0[0] = in0[0];
out1[0] = in1[0];
x = 1;
for (; x < im->xsize-1-3; x += 4) {
__m128 ss0, ss1, ss2, ss3, ss4, ss5;
__m128 pix00, pix10, pix20, pix30;
__m128i ssi0;
MM_KERNEL1x3_LOAD(0, x-1);
MM_KERNEL1x3_SUM1(ss0, 0, 0);
MM_KERNEL1x3_SUM1(ss1, 0, 1);
ss0 = _mm_hadd_ps(ss0, ss1);
pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x-1]));
MM_KERNEL1x3_SUM1_2(ss3, 0, 0);
MM_KERNEL1x3_SUM1_2(ss4, 0, 1);
ss3 = _mm_hadd_ps(ss3, ss4);
MM_KERNEL1x3_LOAD(0, x+1);
MM_KERNEL1x3_SUM1(ss1, 0, 0);
MM_KERNEL1x3_SUM1(ss2, 0, 1);
ss1 = _mm_hadd_ps(ss1, ss2);
pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x+1]));
MM_KERNEL1x3_SUM1_2(ss4, 0, 0);
MM_KERNEL1x3_SUM1_2(ss5, 0, 1);
ss4 = _mm_hadd_ps(ss4, ss5);
ss0 = _mm_hadd_ps(ss0, ss1);
ss0 = _mm_add_ps(ss0, mm_offset);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
*((UINT32*) &out0[x]) = _mm_cvtsi128_si32(ssi0);
ss3 = _mm_hadd_ps(ss3, ss4);
ss3 = _mm_add_ps(ss3, mm_offset);
ssi0 = _mm_cvtps_epi32(ss3);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
*((UINT32*) &out1[x]) = _mm_cvtsi128_si32(ssi0);
}
for (; x < im->xsize-1; x++) {
__m128 ss0, ss1;
__m128 pix00, pix10, pix20, pix30;
__m128i ssi0;
pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]);
pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]);
pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]);
pix30 = _mm_set_ps(0, in2[x+1], in2[x], in2[x-1]);
MM_KERNEL1x3_SUM1(ss0, 0, 0);
MM_KERNEL1x3_SUM1_2(ss1, 0, 0);
ss0 = _mm_hadd_ps(ss0, ss0);
ss0 = _mm_hadd_ps(ss0, ss0);
ss0 = _mm_add_ps(ss0, mm_offset);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out0[x] = _mm_cvtsi128_si32(ssi0);
ss1 = _mm_hadd_ps(ss1, ss1);
ss1 = _mm_hadd_ps(ss1, ss1);
ss1 = _mm_add_ps(ss1, mm_offset);
ssi0 = _mm_cvtps_epi32(ss1);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out1[x] = _mm_cvtsi128_si32(ssi0);
}
out0[x] = in0[x];
out1[x] = in1[x];
}
for (; y < im->ysize-1; y++) {
UINT8* in_1 = im->image8[y-1];
UINT8* in0 = im->image8[y];
UINT8* in1 = im->image8[y+1];
UINT8* out = imOut->image8[y];
out[0] = in0[0];
x = 1;
for (; x < im->xsize-2; x++) {
__m128 ss;
__m128 pix00, pix10, pix20;
__m128i ssi0;
MM_KERNEL1x3_LOAD(0, x-1);
MM_KERNEL1x3_SUM1(ss, 0, 0);
ss = _mm_hadd_ps(ss, ss);
ss = _mm_hadd_ps(ss, ss);
ss = _mm_add_ps(ss, mm_offset);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
}
for (; x < im->xsize-1; x++) {
__m128 ss;
__m128 pix00, pix10, pix20;
__m128i ssi0;
pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]);
pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]);
pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]);
MM_KERNEL1x3_SUM1(ss, 0, 0);
ss = _mm_hadd_ps(ss, ss);
ss = _mm_hadd_ps(ss, ss);
ss = _mm_add_ps(ss, mm_offset);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
}
out[x] = in0[x];
}
memcpy(imOut->image8[y], im->image8[y], im->linesize);
#undef MM_KERNEL1x3_SUM1
#undef MM_KERNEL1x3_SUM1_2
#undef MM_KERNEL1x3_LOAD
}
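
The double _mm_hadd_ps in the loops above performs a horizontal sum of four float lanes. A minimal sketch of just that reduction on its own (illustrative; requires SSE3):

#include <pmmintrin.h>

static float
hsum_ps(__m128 v)
{
    v = _mm_hadd_ps(v, v);   /* [a+b, c+d, a+b, c+d] */
    v = _mm_hadd_ps(v, v);   /* [a+b+c+d, ...] */
    return _mm_cvtss_f32(v);
}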

src/libImaging/FilterSIMD_3x3i_4u8.c

@@ -0,0 +1,222 @@
void
ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
INT32 offset)
{
int x, y;
offset += 1 << (PRECISION_BITS-1);
memcpy(imOut->image32[0], im->image32[0], im->linesize);
for (y = 1; y < im->ysize-1; y++) {
INT32* in_1 = im->image32[y-1];
INT32* in0 = im->image32[y];
INT32* in1 = im->image32[y+1];
INT32* out = imOut->image32[y];
#if defined(__AVX2__)
#define MM_KERNEL_LOAD(x) \
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \
pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \
pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \
pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \
pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \
pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \
pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
#define MM_KERNEL_SUM(ss, row, kctl) \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
pix0##row, _mm256_shuffle_epi32(kernel00, kctl))); \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
pix1##row, _mm256_shuffle_epi32(kernel10, kctl))); \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
pix2##row, _mm256_shuffle_epi32(kernel20, kctl)));
__m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
__m256i kernel00 = _mm256_set_epi16(
0, 0, 0, 0, kernel[2], kernel[1], kernel[0], 0,
0, 0, 0, 0, 0, kernel[2], kernel[1], kernel[0]);
__m256i kernel10 = _mm256_set_epi16(
0, 0, 0, 0, kernel[5], kernel[4], kernel[3], 0,
0, 0, 0, 0, 0, kernel[5], kernel[4], kernel[3]);
__m256i kernel20 = _mm256_set_epi16(
0, 0, 0, 0, kernel[8], kernel[7], kernel[6], 0,
0, 0, 0, 0, 0, kernel[8], kernel[7], kernel[6]);
__m256i pix00, pix10, pix20;
__m256i pix01, pix11, pix21;
__m256i source;
out[0] = in0[0];
x = 1;
if (im->xsize >= 4) {
MM_KERNEL_LOAD(0);
}
// Last loaded pixel: x+3 + 3 (wide load)
// Last x should be < im->xsize-6
for (; x < im->xsize-1-5; x += 4) {
__m256i ss0, ss2;
__m128i ssout;
ss0 = _mm256_set1_epi32(offset);
MM_KERNEL_SUM(ss0, 0, 0x00);
MM_KERNEL_SUM(ss0, 1, 0x55);
ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS);
ss2 = _mm256_set1_epi32(offset);
MM_KERNEL_SUM(ss2, 1, 0x00);
MM_KERNEL_LOAD(x+3);
MM_KERNEL_SUM(ss2, 0, 0x55);
ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS);
ss0 = _mm256_packs_epi32(ss0, ss2);
ssout = _mm_packus_epi16(
_mm256_extracti128_si256(ss0, 0),
_mm256_extracti128_si256(ss0, 1));
ssout = _mm_shuffle_epi32(ssout, 0xd8);
_mm_storeu_si128((__m128i*) &out[x], ssout);
}
for (; x < im->xsize-1; x++) {
__m256i ss = _mm256_set1_epi32(offset);
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128(
_mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in1[x+1]), 8),
_mm_loadl_epi64((__m128i*) &in1[x-1])), shuffle));
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1);
pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128(
_mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in0[x+1]), 8),
_mm_loadl_epi64((__m128i*) &in0[x-1])), shuffle));
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1);
pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128(
_mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in_1[x+1]), 8),
_mm_loadl_epi64((__m128i*) &in_1[x-1])), shuffle));
source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1);
pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
MM_KERNEL_SUM(ss, 0, 0x00);
MM_KERNEL_SUM(ss, 1, 0x55);
ss = _mm256_srai_epi32(ss, PRECISION_BITS);
ss = _mm256_packs_epi32(ss, ss);
ss = _mm256_packus_epi16(ss, ss);
out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss));
}
out[x] = in0[x];
#undef MM_KERNEL_LOAD
#undef MM_KERNEL_SUM
#else
#define MM_KERNEL_LOAD(x) \
source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \
pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \
pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \
source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \
pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \
pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \
source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \
pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \
pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128());
#define MM_KERNEL_SUM(ss, row, kctl) \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
pix0##row, _mm_shuffle_epi32(kernel00, kctl))); \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
pix1##row, _mm_shuffle_epi32(kernel10, kctl))); \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
pix2##row, _mm_shuffle_epi32(kernel20, kctl)));
__m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
__m128i kernel00 = _mm_set_epi16(
kernel[2], kernel[1], kernel[0], 0,
0, kernel[2], kernel[1], kernel[0]);
__m128i kernel10 = _mm_set_epi16(
kernel[5], kernel[4], kernel[3], 0,
0, kernel[5], kernel[4], kernel[3]);
__m128i kernel20 = _mm_set_epi16(
kernel[8], kernel[7], kernel[6], 0,
0, kernel[8], kernel[7], kernel[6]);
__m128i pix00, pix10, pix20;
__m128i pix01, pix11, pix21;
__m128i source;
out[0] = in0[0];
x = 1;
if (im->xsize >= 4) {
MM_KERNEL_LOAD(0);
}
// Last loaded pixel: x+3 + 3 (wide load)
// Last x should be < im->xsize-6
for (; x < im->xsize-1-5; x += 4) {
__m128i ss0 = _mm_set1_epi32(offset);
__m128i ss1 = _mm_set1_epi32(offset);
__m128i ss2 = _mm_set1_epi32(offset);
__m128i ss3 = _mm_set1_epi32(offset);
MM_KERNEL_SUM(ss0, 0, 0x00);
MM_KERNEL_SUM(ss0, 1, 0x55);
ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
MM_KERNEL_SUM(ss1, 0, 0xaa);
MM_KERNEL_SUM(ss1, 1, 0xff);
ss1 = _mm_srai_epi32(ss1, PRECISION_BITS);
MM_KERNEL_SUM(ss2, 1, 0x00);
MM_KERNEL_SUM(ss3, 1, 0xaa);
MM_KERNEL_LOAD(x+3);
MM_KERNEL_SUM(ss2, 0, 0x55);
MM_KERNEL_SUM(ss3, 0, 0xff);
ss2 = _mm_srai_epi32(ss2, PRECISION_BITS);
ss3 = _mm_srai_epi32(ss3, PRECISION_BITS);
ss0 = _mm_packs_epi32(ss0, ss1);
ss2 = _mm_packs_epi32(ss2, ss3);
ss0 = _mm_packus_epi16(ss0, ss2);
_mm_storeu_si128((__m128i*) &out[x], ss0);
}
for (; x < im->xsize-1; x++) {
__m128i ss = _mm_set1_epi32(offset);
source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in1[x-1]), shuffle);
pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in1[x+1]), shuffle);
pix01 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in0[x-1]), shuffle);
pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in0[x+1]), shuffle);
pix11 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in_1[x-1]), shuffle);
pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in_1[x+1]), shuffle);
pix21 = _mm_unpacklo_epi8(source, _mm_setzero_si128());
MM_KERNEL_SUM(ss, 0, 0x00);
MM_KERNEL_SUM(ss, 1, 0x55);
ss = _mm_srai_epi32(ss, PRECISION_BITS);
ss = _mm_packs_epi32(ss, ss);
ss = _mm_packus_epi16(ss, ss);
out[x] = _mm_cvtsi128_si32(ss);
}
out[x] = in0[x];
#undef MM_KERNEL_LOAD
#undef MM_KERNEL_SUM
#endif
}
memcpy(imOut->image32[y], im->image32[y], im->linesize);
}
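
The shuffle constant above interleaves two adjacent RGBA pixels channel by channel, so each 32-bit lane fed to _mm_madd_epi16 multiplies one channel of pixel x and of pixel x+1 by two adjacent kernel coefficients in a single instruction. A standalone demonstration of the byte rearrangement (illustrative only; compile with SSSE3 enabled):

#include <tmmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Four RGBA pixels P0..P3; byte value = pixel*10 + channel. */
    uint8_t px[16], res[16];
    int p, c, i;
    __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
    __m128i v;
    for (p = 0; p < 4; p++)
        for (c = 0; c < 4; c++)
            px[p * 4 + c] = p * 10 + c;
    v = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *) px), shuffle);
    _mm_storeu_si128((__m128i *) res, v);
    for (i = 0; i < 16; i++)
        printf("%d ", res[i]);
    /* Prints: 0 10 1 11 2 12 3 13 20 30 21 31 22 32 23 33
       (P0,P1) and (P2,P3) interleaved per channel, ready for madd. */
    return 0;
}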

src/libImaging/FilterSIMD_5x5f_4u8.c

@@ -0,0 +1,126 @@
void
ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel,
float offset)
{
#define MM_KERNEL1x5_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \
pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \
pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x]));
#define MM_KERNEL1x5_SUM(row, kindex) \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[5 + kindex]))); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[10 + kindex]))); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, _mm_set1_ps(kernel[15 + kindex]))); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, _mm_set1_ps(kernel[20 + kindex])));
int x, y;
memcpy(imOut->image32[0], im->image32[0], im->linesize);
memcpy(imOut->image32[1], im->image32[1], im->linesize);
for (y = 2; y < im->ysize-2; y++) {
INT32* in_2 = im->image32[y-2];
INT32* in_1 = im->image32[y-1];
INT32* in0 = im->image32[y];
INT32* in1 = im->image32[y+1];
INT32* in2 = im->image32[y+2];
INT32* out = imOut->image32[y];
__m128 pix00, pix10, pix20, pix30, pix40;
__m128 pix01, pix11, pix21, pix31, pix41;
__m128 pix02, pix12, pix22, pix32, pix42;
__m128 pix03, pix13, pix23, pix33, pix43;
__m128 pix04, pix14, pix24, pix34, pix44;
MM_KERNEL1x5_LOAD(0, 0);
MM_KERNEL1x5_LOAD(1, 1);
MM_KERNEL1x5_LOAD(2, 2);
MM_KERNEL1x5_LOAD(3, 3);
out[0] = in0[0];
out[1] = in0[1];
x = 2;
for (; x < im->xsize-2-4; x += 5) {
__m128 ss;
__m128i ssi0, ssi1, ssi2, ssi3;
ss = _mm_set1_ps(offset);
MM_KERNEL1x5_SUM(0, 0);
MM_KERNEL1x5_SUM(1, 1);
MM_KERNEL1x5_SUM(2, 2);
MM_KERNEL1x5_SUM(3, 3);
MM_KERNEL1x5_LOAD(4, x+2);
MM_KERNEL1x5_SUM(4, 4);
ssi0 = _mm_cvtps_epi32(ss);
ss = _mm_set1_ps(offset);
MM_KERNEL1x5_SUM(1, 0);
MM_KERNEL1x5_SUM(2, 1);
MM_KERNEL1x5_SUM(3, 2);
MM_KERNEL1x5_SUM(4, 3);
MM_KERNEL1x5_LOAD(0, x+3);
MM_KERNEL1x5_SUM(0, 4);
ssi1 = _mm_cvtps_epi32(ss);
ss = _mm_set1_ps(offset);
MM_KERNEL1x5_SUM(2, 0);
MM_KERNEL1x5_SUM(3, 1);
MM_KERNEL1x5_SUM(4, 2);
MM_KERNEL1x5_SUM(0, 3);
MM_KERNEL1x5_LOAD(1, x+4);
MM_KERNEL1x5_SUM(1, 4);
ssi2 = _mm_cvtps_epi32(ss);
ss = _mm_set1_ps(offset);
MM_KERNEL1x5_SUM(3, 0);
MM_KERNEL1x5_SUM(4, 1);
MM_KERNEL1x5_SUM(0, 2);
MM_KERNEL1x5_SUM(1, 3);
MM_KERNEL1x5_LOAD(2, x+5);
MM_KERNEL1x5_SUM(2, 4);
ssi3 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi1);
ssi1 = _mm_packs_epi32(ssi2, ssi3);
ssi0 = _mm_packus_epi16(ssi0, ssi1);
_mm_storeu_si128((__m128i*) &out[x], ssi0);
ss = _mm_set1_ps(offset);
MM_KERNEL1x5_SUM(4, 0);
MM_KERNEL1x5_SUM(0, 1);
MM_KERNEL1x5_SUM(1, 2);
MM_KERNEL1x5_SUM(2, 3);
MM_KERNEL1x5_LOAD(3, x+6);
MM_KERNEL1x5_SUM(3, 4);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x+4] = _mm_cvtsi128_si32(ssi0);
}
for (; x < im->xsize-2; x++) {
__m128 ss = _mm_set1_ps(offset);
__m128i ssi0;
MM_KERNEL1x5_LOAD(0, x-2);
MM_KERNEL1x5_SUM(0, 0);
MM_KERNEL1x5_LOAD(0, x-1);
MM_KERNEL1x5_SUM(0, 1);
MM_KERNEL1x5_LOAD(0, x+0);
MM_KERNEL1x5_SUM(0, 2);
MM_KERNEL1x5_LOAD(0, x+1);
MM_KERNEL1x5_SUM(0, 3);
MM_KERNEL1x5_LOAD(0, x+2);
MM_KERNEL1x5_SUM(0, 4);
ssi0 = _mm_cvtps_epi32(ss);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
}
out[x] = in0[x];
out[x+1] = in0[x+1];
}
memcpy(imOut->image32[y], im->image32[y], im->linesize);
memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize);
#undef MM_KERNEL1x5_LOAD
#undef MM_KERNEL1x5_SUM
}
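
The main loop above keeps five pixel columns resident in registers and rotates which kernel column each one is multiplied with, so the five outputs per iteration cost only one fresh load each instead of five. A scalar sketch of the same rolling-window idea (illustrative; assumes n >= 5; the SIMD code rotates kernel indices instead of moving data):

static void
conv5_row(const float *src, float *dst, const float k[5], int n)
{
    float win[5];
    int i, x;
    for (i = 0; i < 5; i++)
        win[i] = src[i];                      /* columns x-2..x+2 for x = 2 */
    for (x = 2; x < n - 2; x++) {
        float ss = 0;
        for (i = 0; i < 5; i++)
            ss += win[i] * k[i];
        dst[x] = ss;
        if (x + 3 < n) {                      /* slide: one new column only */
            for (i = 0; i < 4; i++)
                win[i] = win[i + 1];
            win[4] = src[x + 3];
        }
    }
}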

src/libImaging/FilterSIMD_5x5f_u8.c

@@ -0,0 +1,100 @@
void
ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel,
float offset)
{
#define MM_KERNEL1x5_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \
pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \
pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x]));
#define MM_KERNEL1x5_SUM(ss, row, krow) \
ss = _mm_mul_ps(pix0##row, kernel0##krow); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##krow)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##krow)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, kernel3##krow)); \
ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, kernel4##krow));
int x, y;
memcpy(imOut->image8[0], im->image8[0], im->linesize);
memcpy(imOut->image8[1], im->image8[1], im->linesize);
for (y = 2; y < im->ysize-2; y++) {
UINT8* in_2 = im->image8[y-2];
UINT8* in_1 = im->image8[y-1];
UINT8* in0 = im->image8[y];
UINT8* in1 = im->image8[y+1];
UINT8* in2 = im->image8[y+2];
UINT8* out = imOut->image8[y];
__m128 kernelx0 = _mm_set_ps(kernel[15], kernel[10], kernel[5], kernel[0]);
__m128 kernel00 = _mm_loadu_ps(&kernel[0+1]);
__m128 kernel10 = _mm_loadu_ps(&kernel[5+1]);
__m128 kernel20 = _mm_loadu_ps(&kernel[10+1]);
__m128 kernel30 = _mm_loadu_ps(&kernel[15+1]);
__m128 kernel40 = _mm_loadu_ps(&kernel[20+1]);
__m128 kernelx1 = _mm_set_ps(kernel[19], kernel[14], kernel[9], kernel[4]);
__m128 kernel01 = _mm_loadu_ps(&kernel[0+0]);
__m128 kernel11 = _mm_loadu_ps(&kernel[5+0]);
__m128 kernel21 = _mm_loadu_ps(&kernel[10+0]);
__m128 kernel31 = _mm_loadu_ps(&kernel[15+0]);
__m128 kernel41 = _mm_loadu_ps(&kernel[20+0]);
out[0] = in0[0];
out[1] = in0[1];
x = 2;
for (; x < im->xsize-2-1; x += 2) {
__m128 ss0, ss1;
__m128i ssi0;
__m128 pix00, pix10, pix20, pix30, pix40;
MM_KERNEL1x5_LOAD(0, x-1);
MM_KERNEL1x5_SUM(ss0, 0, 0);
ss0 = _mm_add_ps(ss0, _mm_mul_ps(
_mm_set_ps(in_1[x-2], in0[x-2], in1[x-2], in2[x-2]),
kernelx0));
MM_KERNEL1x5_SUM(ss1, 0, 1);
ss1 = _mm_add_ps(ss1, _mm_mul_ps(
_mm_set_ps(in_1[x+3], in0[x+3], in1[x+3], in2[x+3]),
kernelx1));
ss0 = _mm_hadd_ps(ss0, ss1);
ss0 = _mm_add_ps(ss0, _mm_set_ps(
offset, kernel[24] * in_2[x+3],
offset, kernel[20] * in_2[x-2]));
ss0 = _mm_hadd_ps(ss0, ss0);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
*((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0);
}
for (; x < im->xsize-2; x++) {
__m128 ss0;
__m128i ssi0;
__m128 pix00, pix10, pix20, pix30, pix40;
MM_KERNEL1x5_LOAD(0, x-2);
MM_KERNEL1x5_SUM(ss0, 0, 1);
ss0 = _mm_add_ps(ss0, _mm_mul_ps(
_mm_set_ps(in_1[x+2], in0[x+2], in1[x+2], in2[x+2]),
kernelx1));
ss0 = _mm_add_ps(ss0, _mm_set_ps(
0, 0, offset, kernel[24] * in_2[x+2]));
ss0 = _mm_hadd_ps(ss0, ss0);
ss0 = _mm_hadd_ps(ss0, ss0);
ssi0 = _mm_cvtps_epi32(ss0);
ssi0 = _mm_packs_epi32(ssi0, ssi0);
ssi0 = _mm_packus_epi16(ssi0, ssi0);
out[x] = _mm_cvtsi128_si32(ssi0);
}
out[x+0] = in0[x+0];
out[x+1] = in0[x+1];
}
memcpy(imOut->image8[y], im->image8[y], im->linesize);
memcpy(imOut->image8[y+1], im->image8[y+1], im->linesize);
#undef MM_KERNEL1x5_LOAD
#undef MM_KERNEL1x5_SUM
}

src/libImaging/FilterSIMD_5x5i_4u8.c

@@ -0,0 +1,250 @@
void
ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
INT32 offset)
{
int x, y;
offset += 1 << (PRECISION_BITS-1);
memcpy(imOut->image32[0], im->image32[0], im->linesize);
memcpy(imOut->image32[1], im->image32[1], im->linesize);
for (y = 2; y < im->ysize-2; y++) {
INT32* in_2 = im->image32[y-2];
INT32* in_1 = im->image32[y-1];
INT32* in0 = im->image32[y];
INT32* in1 = im->image32[y+1];
INT32* in2 = im->image32[y+2];
INT32* out = imOut->image32[y];
#if defined(__AVX2__)
#define MM_KERNEL_LOAD(row, x) \
pix0##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle)); \
pix0##row = _mm256_inserti128_si256(pix0##row, _mm256_castsi256_si128(pix0##row), 1); \
pix1##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \
pix1##row = _mm256_inserti128_si256(pix1##row, _mm256_castsi256_si128(pix1##row), 1); \
pix2##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \
pix2##row = _mm256_inserti128_si256(pix2##row, _mm256_castsi256_si128(pix2##row), 1); \
pix3##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \
pix3##row = _mm256_inserti128_si256(pix3##row, _mm256_castsi256_si128(pix3##row), 1); \
pix4##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle)); \
pix4##row = _mm256_inserti128_si256(pix4##row, _mm256_castsi256_si128(pix4##row), 1);
#define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
unpack_epi8(pix0##row, zero), _mm256_shuffle_epi32(kernel0##krow, kctl))); \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
unpack_epi8(pix1##row, zero), _mm256_shuffle_epi32(kernel1##krow, kctl))); \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
unpack_epi8(pix2##row, zero), _mm256_shuffle_epi32(kernel2##krow, kctl))); \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
unpack_epi8(pix3##row, zero), _mm256_shuffle_epi32(kernel3##krow, kctl))); \
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \
unpack_epi8(pix4##row, zero), _mm256_shuffle_epi32(kernel4##krow, kctl)));
__m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
__m256i kernel00 = _mm256_set_epi16(
0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0,
0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]);
__m256i kernel10 = _mm256_set_epi16(
0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0,
0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]);
__m256i kernel20 = _mm256_set_epi16(
0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0,
0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]);
__m256i kernel30 = _mm256_set_epi16(
0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0,
0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]);
__m256i kernel40 = _mm256_set_epi16(
0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0,
0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]);
__m256i zero = _mm256_setzero_si256();
__m256i pix00, pix10, pix20, pix30, pix40;
__m256i pix01, pix11, pix21, pix31, pix41;
out[0] = in0[0];
out[1] = in0[1];
x = 2;
MM_KERNEL_LOAD(0, 0);
for (; x < im->xsize-2-3; x += 4) {
__m256i ss0, ss2;
__m128i ssout;
ss0 = _mm256_set1_epi32(offset);
ss2 = _mm256_set1_epi32(offset);
MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00);
MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55);
MM_KERNEL_SUM(ss2, 0, _mm256_unpackhi_epi8, 0, 0x00);
MM_KERNEL_LOAD(1, x+2);
MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa);
ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS);
MM_KERNEL_SUM(ss2, 1, _mm256_unpacklo_epi8, 0, 0x55);
MM_KERNEL_SUM(ss2, 1, _mm256_unpackhi_epi8, 0, 0xaa);
ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS);
pix00 = pix01;
pix10 = pix11;
pix20 = pix21;
pix30 = pix31;
pix40 = pix41;
ss0 = _mm256_packs_epi32(ss0, ss2);
ssout = _mm_packus_epi16(
_mm256_extracti128_si256(ss0, 0),
_mm256_extracti128_si256(ss0, 1));
ssout = _mm_shuffle_epi32(ssout, 0xd8);
_mm_storeu_si128((__m128i*) &out[x], ssout);
}
for (; x < im->xsize-2; x++) {
__m256i ss0;
MM_KERNEL_LOAD(0, x-2);
pix01 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle));
pix01 = _mm256_inserti128_si256(pix01, _mm256_castsi256_si128(pix01), 1);
pix11 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle));
pix11 = _mm256_inserti128_si256(pix11, _mm256_castsi256_si128(pix11), 1);
pix21 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle));
pix21 = _mm256_inserti128_si256(pix21, _mm256_castsi256_si128(pix21), 1);
pix31 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle));
pix31 = _mm256_inserti128_si256(pix31, _mm256_castsi256_si128(pix31), 1);
pix41 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle));
pix41 = _mm256_inserti128_si256(pix41, _mm256_castsi256_si128(pix41), 1);
ss0 = _mm256_set1_epi32(offset);
MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00);
MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55);
MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa);
ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS);
ss0 = _mm256_packs_epi32(ss0, ss0);
ss0 = _mm256_packus_epi16(ss0, ss0);
out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss0));
}
out[x] = in0[x];
out[x+1] = in0[x+1];
#undef MM_KERNEL_LOAD
#undef MM_KERNEL_SUM
#else
#define MM_KERNEL_LOAD(row, x) \
pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle); \
pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \
pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \
pix3##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \
pix4##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle);
#define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
unpack_epi8(pix0##row, zero), _mm_shuffle_epi32(kernel0##krow, kctl))); \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
unpack_epi8(pix1##row, zero), _mm_shuffle_epi32(kernel1##krow, kctl))); \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
unpack_epi8(pix2##row, zero), _mm_shuffle_epi32(kernel2##krow, kctl))); \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
unpack_epi8(pix3##row, zero), _mm_shuffle_epi32(kernel3##krow, kctl))); \
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
unpack_epi8(pix4##row, zero), _mm_shuffle_epi32(kernel4##krow, kctl)));
__m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
__m128i kernel00 = _mm_set_epi16(
0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]);
__m128i kernel10 = _mm_set_epi16(
0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]);
__m128i kernel20 = _mm_set_epi16(
0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]);
__m128i kernel30 = _mm_set_epi16(
0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]);
__m128i kernel40 = _mm_set_epi16(
0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]);
__m128i kernel01 = _mm_set_epi16(
0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0);
__m128i kernel11 = _mm_set_epi16(
0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0);
__m128i kernel21 = _mm_set_epi16(
0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0);
__m128i kernel31 = _mm_set_epi16(
0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0);
__m128i kernel41 = _mm_set_epi16(
0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0);
__m128i zero = _mm_setzero_si128();
__m128i pix00, pix10, pix20, pix30, pix40;
__m128i pix01, pix11, pix21, pix31, pix41;
out[0] = in0[0];
out[1] = in0[1];
x = 2;
MM_KERNEL_LOAD(0, 0);
for (; x < im->xsize-2-3; x += 4) {
__m128i ss0, ss1, ss2, ss3;
ss0 = _mm_set1_epi32(offset);
ss1 = _mm_set1_epi32(offset);
ss2 = _mm_set1_epi32(offset);
ss3 = _mm_set1_epi32(offset);
MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55);
MM_KERNEL_SUM(ss1, 0, _mm_unpacklo_epi8, 1, 0x00);
MM_KERNEL_SUM(ss1, 0, _mm_unpackhi_epi8, 1, 0x55);
MM_KERNEL_SUM(ss2, 0, _mm_unpackhi_epi8, 0, 0x00);
MM_KERNEL_SUM(ss3, 0, _mm_unpackhi_epi8, 1, 0x00);
MM_KERNEL_LOAD(1, x+2);
MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa);
ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
MM_KERNEL_SUM(ss1, 1, _mm_unpacklo_epi8, 1, 0xaa);
ss1 = _mm_srai_epi32(ss1, PRECISION_BITS);
MM_KERNEL_SUM(ss2, 1, _mm_unpacklo_epi8, 0, 0x55);
MM_KERNEL_SUM(ss2, 1, _mm_unpackhi_epi8, 0, 0xaa);
ss2 = _mm_srai_epi32(ss2, PRECISION_BITS);
MM_KERNEL_SUM(ss3, 1, _mm_unpacklo_epi8, 1, 0x55);
MM_KERNEL_SUM(ss3, 1, _mm_unpackhi_epi8, 1, 0xaa);
ss3 = _mm_srai_epi32(ss3, PRECISION_BITS);
pix00 = pix01;
pix10 = pix11;
pix20 = pix21;
pix30 = pix31;
pix40 = pix41;
ss0 = _mm_packs_epi32(ss0, ss1);
ss2 = _mm_packs_epi32(ss2, ss3);
ss0 = _mm_packus_epi16(ss0, ss2);
_mm_storeu_si128((__m128i*) &out[x], ss0);
}
for (; x < im->xsize-2; x++) {
__m128i ss0;
MM_KERNEL_LOAD(0, x-2);
pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle);
pix11 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle);
pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle);
pix31 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle);
pix41 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle);
ss0 = _mm_set1_epi32(offset);
MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55);
MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa);
ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
ss0 = _mm_packs_epi32(ss0, ss0);
ss0 = _mm_packus_epi16(ss0, ss0);
out[x] = _mm_cvtsi128_si32(ss0);
}
out[x] = in0[x];
out[x+1] = in0[x+1];
#undef MM_KERNEL_LOAD
#undef MM_KERNEL_SUM
#endif
}
memcpy(imOut->image32[y], im->image32[y], im->linesize);
memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize);
}
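
As a usage note, the dispatcher in Filter.c above picks these kernels automatically. A hypothetical caller exercising the integer fast path (a 3x3 box blur: every coefficient fits in INT16 after PRECISION_BITS scaling):

FLOAT32 kernel[9] = {
    1/9.0f, 1/9.0f, 1/9.0f,
    1/9.0f, 1/9.0f, 1/9.0f,
    1/9.0f, 1/9.0f, 1/9.0f,
};
/* For a 4-channel UINT8 image this reaches ImagingFilter3x3i_4u8. */
Imaging imOut = ImagingFilter(im, 3, 3, kernel, 0.0f);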