mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-08-21 04:34:47 +03:00
SIMD Filter. 5x5 single channel SSE4 (tests failed)
This commit is contained in:
parent
92c591016e
commit
92d392a99c
|
@ -486,10 +486,60 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) {
|
||||||
UINT8* in1 = (UINT8*) im->image[y+1];
|
UINT8* in1 = (UINT8*) im->image[y+1];
|
||||||
UINT8* in2 = (UINT8*) im->image[y+2];
|
UINT8* in2 = (UINT8*) im->image[y+2];
|
||||||
UINT8* out = (UINT8*) imOut->image[y];
|
UINT8* out = (UINT8*) imOut->image[y];
|
||||||
|
__m128 kernelx0 = _mm_set_ps(kernel[15], kernel[10], kernel[5], kernel[0]);
|
||||||
|
__m128 kernel00 = _mm_loadu_ps(&kernel[0+1]);
|
||||||
|
__m128 kernel10 = _mm_loadu_ps(&kernel[5+1]);
|
||||||
|
__m128 kernel20 = _mm_loadu_ps(&kernel[10+1]);
|
||||||
|
__m128 kernel30 = _mm_loadu_ps(&kernel[15+1]);
|
||||||
|
__m128 kernel40 = _mm_loadu_ps(&kernel[20+1]);
|
||||||
|
__m128 kernelx1 = _mm_set_ps(kernel[19], kernel[14], kernel[9], kernel[4]);
|
||||||
|
__m128 kernel01 = _mm_loadu_ps(&kernel[0+0]);
|
||||||
|
__m128 kernel11 = _mm_loadu_ps(&kernel[5+0]);
|
||||||
|
__m128 kernel21 = _mm_loadu_ps(&kernel[10+0]);
|
||||||
|
__m128 kernel31 = _mm_loadu_ps(&kernel[15+0]);
|
||||||
|
__m128 kernel41 = _mm_loadu_ps(&kernel[20+0]);
|
||||||
|
|
||||||
out[0] = in0[0];
|
out[0] = in0[0];
|
||||||
out[1] = in0[1];
|
out[1] = in0[1];
|
||||||
for (x = 2; x < im->xsize - 2; x++) {
|
x = 2;
|
||||||
|
for (; x < im->xsize-2-1; x += 2) {
|
||||||
|
__m128 ss0, ss1;
|
||||||
|
__m128i ssi0;
|
||||||
|
__m128 pix00, pix10, pix20, pix30, pix40;
|
||||||
|
|
||||||
|
MM_KERNEL1x5_LOAD(0, x-1);
|
||||||
|
|
||||||
|
ss0 = _mm_setzero_ps();
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix00, kernel00));
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix10, kernel10));
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix20, kernel20));
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix30, kernel30));
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_mul_ps(pix40, kernel40));
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_mul_ps(
|
||||||
|
_mm_set_ps(in_1[x-2], in0[x-2], in1[x-2], in2[x-2]),
|
||||||
|
kernelx0));
|
||||||
|
|
||||||
|
ss1 = _mm_setzero_ps();
|
||||||
|
ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix00, kernel01));
|
||||||
|
ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix10, kernel11));
|
||||||
|
ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix20, kernel21));
|
||||||
|
ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix30, kernel31));
|
||||||
|
ss1 = _mm_add_ps(ss1, _mm_mul_ps(pix40, kernel41));
|
||||||
|
ss1 = _mm_add_ps(ss1, _mm_mul_ps(
|
||||||
|
_mm_set_ps(in_1[x+3], in0[x+3], in1[x+3], in2[x+3]),
|
||||||
|
kernelx1));
|
||||||
|
|
||||||
|
ss0 = _mm_hadd_ps(ss0, ss1);
|
||||||
|
ss0 = _mm_add_ps(ss0, _mm_set_ps(
|
||||||
|
offset, kernel[24] * in_2[x+3],
|
||||||
|
offset, kernel[20] * in_2[x-2]));
|
||||||
|
ss0 = _mm_hadd_ps(ss0, ss0);
|
||||||
|
ssi0 = _mm_cvtps_epi32(ss0);
|
||||||
|
ssi0 = _mm_packs_epi32(ssi0, ssi0);
|
||||||
|
ssi0 = _mm_packus_epi16(ssi0, ssi0);
|
||||||
|
*((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0);
|
||||||
|
}
|
||||||
|
for (; x < im->xsize-2; x++) {
|
||||||
float ss = offset;
|
float ss = offset;
|
||||||
ss += KERNEL1x5(in2, x, &kernel[0], 1);
|
ss += KERNEL1x5(in2, x, &kernel[0], 1);
|
||||||
ss += KERNEL1x5(in1, x, &kernel[5], 1);
|
ss += KERNEL1x5(in1, x, &kernel[5], 1);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user