mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-08-21 04:34:47 +03:00
SIMD Filter. 3x3i
This commit is contained in:
parent
b561752597
commit
eb57b5eca7
107
libImaging/FilterSIMD_3x3i_4u8.c
Normal file
107
libImaging/FilterSIMD_3x3i_4u8.c
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
void
|
||||||
|
ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel,
|
||||||
|
INT32 offset)
|
||||||
|
{
|
||||||
|
#define MM_KERNEL_LOAD(row, x) \
|
||||||
|
pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \
|
||||||
|
pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \
|
||||||
|
pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle);
|
||||||
|
|
||||||
|
#define MM_KERNEL_SUM(ss, row, krow) \
|
||||||
|
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
|
||||||
|
_mm_unpacklo_epi8(pix0##row, _mm_setzero_si128()), \
|
||||||
|
_mm_unpacklo_epi32(kernel0##krow, kernel0##krow))); \
|
||||||
|
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
|
||||||
|
_mm_unpacklo_epi8(pix1##row, _mm_setzero_si128()), \
|
||||||
|
_mm_unpacklo_epi32(kernel1##krow, kernel1##krow))); \
|
||||||
|
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
|
||||||
|
_mm_unpacklo_epi8(pix2##row, _mm_setzero_si128()), \
|
||||||
|
_mm_unpacklo_epi32(kernel2##krow, kernel2##krow))); \
|
||||||
|
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
|
||||||
|
_mm_unpackhi_epi8(pix0##row, _mm_setzero_si128()), \
|
||||||
|
_mm_unpackhi_epi32(kernel0##krow, kernel0##krow))); \
|
||||||
|
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
|
||||||
|
_mm_unpackhi_epi8(pix1##row, _mm_setzero_si128()), \
|
||||||
|
_mm_unpackhi_epi32(kernel1##krow, kernel1##krow))); \
|
||||||
|
ss = _mm_add_epi32(ss, _mm_madd_epi16( \
|
||||||
|
_mm_unpackhi_epi8(pix2##row, _mm_setzero_si128()), \
|
||||||
|
_mm_unpackhi_epi32(kernel2##krow, kernel2##krow)));
|
||||||
|
|
||||||
|
int x, y;
|
||||||
|
|
||||||
|
offset += 1 << (PRECISION_BITS-1);
|
||||||
|
|
||||||
|
memcpy(imOut->image32[0], im->image32[0], im->linesize);
|
||||||
|
|
||||||
|
for (y = 1; y < im->ysize-1; y++) {
|
||||||
|
INT32* in_1 = im->image32[y-1];
|
||||||
|
INT32* in0 = im->image32[y];
|
||||||
|
INT32* in1 = im->image32[y+1];
|
||||||
|
INT32* out = imOut->image32[y];
|
||||||
|
__m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0);
|
||||||
|
__m128i kernel00 = _mm_set_epi16(
|
||||||
|
0, kernel[2], 0, kernel[2],
|
||||||
|
kernel[1], kernel[0], kernel[1], kernel[0]);
|
||||||
|
__m128i kernel01 = _mm_set_epi16(
|
||||||
|
kernel[2], kernel[1], kernel[2], kernel[1],
|
||||||
|
kernel[0], 0, kernel[0], 0);
|
||||||
|
__m128i kernel10 = _mm_set_epi16(
|
||||||
|
0, kernel[5], 0, kernel[5],
|
||||||
|
kernel[4], kernel[3], kernel[4], kernel[3]);
|
||||||
|
__m128i kernel11 = _mm_set_epi16(
|
||||||
|
kernel[5], kernel[4], kernel[5], kernel[4],
|
||||||
|
kernel[3], 0, kernel[3], 0);
|
||||||
|
__m128i kernel20 = _mm_set_epi16(
|
||||||
|
0, kernel[8], 0, kernel[8],
|
||||||
|
kernel[7], kernel[6], kernel[7], kernel[6]);
|
||||||
|
__m128i kernel21 = _mm_set_epi16(
|
||||||
|
kernel[8], kernel[7], kernel[8], kernel[7],
|
||||||
|
kernel[6], 0, kernel[6], 0);
|
||||||
|
// __m128i source;
|
||||||
|
__m128i pix00, pix10, pix20;
|
||||||
|
// __m128i pix01, pix11, pix21;
|
||||||
|
|
||||||
|
out[0] = in0[0];
|
||||||
|
x = 1;
|
||||||
|
for (; x < im->xsize-1-3; x += 4) {
|
||||||
|
__m128i ss0 = _mm_set1_epi32(offset);
|
||||||
|
__m128i ss1 = _mm_set1_epi32(offset);
|
||||||
|
__m128i ss2 = _mm_set1_epi32(offset);
|
||||||
|
__m128i ss3 = _mm_set1_epi32(offset);
|
||||||
|
|
||||||
|
MM_KERNEL_LOAD(0, x-1);
|
||||||
|
MM_KERNEL_SUM(ss0, 0, 0);
|
||||||
|
ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
|
||||||
|
MM_KERNEL_SUM(ss1, 0, 1);
|
||||||
|
ss1 = _mm_srai_epi32(ss1, PRECISION_BITS);
|
||||||
|
ss0 = _mm_packs_epi32(ss0, ss1);
|
||||||
|
|
||||||
|
MM_KERNEL_LOAD(0, x+1);
|
||||||
|
MM_KERNEL_SUM(ss2, 0, 0);
|
||||||
|
ss2 = _mm_srai_epi32(ss2, PRECISION_BITS);
|
||||||
|
MM_KERNEL_SUM(ss3, 0, 1);
|
||||||
|
ss3 = _mm_srai_epi32(ss3, PRECISION_BITS);
|
||||||
|
ss2 = _mm_packs_epi32(ss2, ss3);
|
||||||
|
|
||||||
|
ss0 = _mm_packus_epi16(ss0, ss2);
|
||||||
|
_mm_storeu_si128((__m128i*) &out[x], ss0);
|
||||||
|
}
|
||||||
|
for (; x < im->xsize-1; x++) {
|
||||||
|
__m128i ss = _mm_set1_epi32(offset);
|
||||||
|
|
||||||
|
MM_KERNEL_LOAD(0, x-1);
|
||||||
|
|
||||||
|
MM_KERNEL_SUM(ss, 0, 0);
|
||||||
|
ss = _mm_srai_epi32(ss, PRECISION_BITS);
|
||||||
|
|
||||||
|
ss = _mm_packs_epi32(ss, ss);
|
||||||
|
ss = _mm_packus_epi16(ss, ss);
|
||||||
|
out[x] = _mm_cvtsi128_si32(ss);
|
||||||
|
}
|
||||||
|
out[x] = in0[x];
|
||||||
|
}
|
||||||
|
memcpy(imOut->image32[y], im->image32[y], im->linesize);
|
||||||
|
|
||||||
|
#undef MM_KERNEL_LOAD
|
||||||
|
#undef MM_KERNEL_SUM
|
||||||
|
}
|
|
@ -33,9 +33,18 @@
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* 5 is number of bits enought to account all kernel coefficients (1<<5 > 25).
|
||||||
|
8 is number of bits in result.
|
||||||
|
Any coefficients delta smaller than this precision will have no effect. */
|
||||||
|
#define PRECISION_BITS (8 + 5)
|
||||||
|
/* 16 is number of bis required for kernel storage.
|
||||||
|
1 bit is reserver for sign.
|
||||||
|
Largest possible kernel coefficient is */
|
||||||
|
#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS))
|
||||||
|
|
||||||
#include "FilterSIMD_3x3f_u8.c"
|
#include "FilterSIMD_3x3f_u8.c"
|
||||||
#include "FilterSIMD_3x3f_4u8.c"
|
#include "FilterSIMD_3x3f_4u8.c"
|
||||||
|
#include "FilterSIMD_3x3i_4u8.c"
|
||||||
#include "FilterSIMD_5x5f_u8.c"
|
#include "FilterSIMD_5x5f_u8.c"
|
||||||
#include "FilterSIMD_5x5f_4u8.c"
|
#include "FilterSIMD_5x5f_4u8.c"
|
||||||
|
|
||||||
|
@ -97,9 +106,15 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Imaging
|
Imaging
|
||||||
ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) {
|
ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32* kernel,
|
||||||
Imaging imOut;
|
FLOAT32 offset)
|
||||||
|
{
|
||||||
ImagingSectionCookie cookie;
|
ImagingSectionCookie cookie;
|
||||||
|
Imaging imOut;
|
||||||
|
int i, fast_path = 1;
|
||||||
|
FLOAT32 tmp;
|
||||||
|
INT16 norm_kernel[25];
|
||||||
|
INT32 norm_offset;
|
||||||
|
|
||||||
if (!im || im->type != IMAGING_TYPE_UINT8) {
|
if (!im || im->type != IMAGING_TYPE_UINT8) {
|
||||||
return (Imaging)ImagingError_ModeError();
|
return (Imaging)ImagingError_ModeError();
|
||||||
|
@ -118,14 +133,37 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 o
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < xsize * ysize; i++) {
|
||||||
|
tmp = kernel[i] * (1 << PRECISION_BITS);
|
||||||
|
tmp += (tmp >= 0) ? 0.5 : -0.5;
|
||||||
|
if (tmp >= 32768 || tmp <= -32769) {
|
||||||
|
fast_path = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
norm_kernel[i] = tmp;
|
||||||
|
}
|
||||||
|
tmp = offset * (1 << PRECISION_BITS);
|
||||||
|
tmp += (tmp >= 0) ? 0.5 : -0.5;
|
||||||
|
if (tmp >= 1 << 30) {
|
||||||
|
norm_offset = 1 << 30;
|
||||||
|
} else if (tmp <= -1 << 30) {
|
||||||
|
norm_offset = -1 << 30;
|
||||||
|
} else {
|
||||||
|
norm_offset = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
ImagingSectionEnter(&cookie);
|
ImagingSectionEnter(&cookie);
|
||||||
if (xsize == 3) {
|
if (xsize == 3) {
|
||||||
/* 3x3 kernel. */
|
/* 3x3 kernel. */
|
||||||
if (im->image8) {
|
if (im->image8) {
|
||||||
ImagingFilter3x3f_u8(imOut, im, kernel, offset);
|
ImagingFilter3x3f_u8(imOut, im, kernel, offset);
|
||||||
|
} else {
|
||||||
|
if (fast_path) {
|
||||||
|
ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset);
|
||||||
} else {
|
} else {
|
||||||
ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
|
ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
/* 5x5 kernel. */
|
/* 5x5 kernel. */
|
||||||
if (im->image8) {
|
if (im->image8) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user