mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-08-20 12:14:46 +03:00
SIMD Filter. First AVX try
This commit is contained in:
parent
f2e021aa94
commit
80dfbaf22d
|
@ -244,6 +244,57 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
|
|||
UINT32* in0 = (UINT32*) im->image[y];
|
||||
UINT32* in1 = (UINT32*) im->image[y+1];
|
||||
UINT32* out = (UINT32*) imOut->image[y];
|
||||
#if defined(__AVX2__)
|
||||
__m256 kernel00 = _mm256_insertf128_ps(
|
||||
_mm256_set1_ps(kernel[0+0]),
|
||||
_mm_set1_ps(kernel[0+1]), 1);
|
||||
__m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2]));
|
||||
__m256 kernel10 = _mm256_insertf128_ps(
|
||||
_mm256_set1_ps(kernel[3+0]),
|
||||
_mm_set1_ps(kernel[3+1]), 1);
|
||||
__m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2]));
|
||||
__m256 kernel20 = _mm256_insertf128_ps(
|
||||
_mm256_set1_ps(kernel[6+0]),
|
||||
_mm_set1_ps(kernel[6+1]), 1);
|
||||
__m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2]));
|
||||
__m256 pix00, pix10, pix20;
|
||||
__m256 pix01, pix11, pix21;
|
||||
|
||||
out[0] = in0[0];
|
||||
x = 1;
|
||||
pix00 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[0]));
|
||||
pix10 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[0]));
|
||||
pix20 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[0]));
|
||||
for (; x < im->xsize-1; x += 1) {
|
||||
__m256 ss;
|
||||
__m128i ssi;
|
||||
|
||||
pix01 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[x+1]));
|
||||
pix11 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[x+1]));
|
||||
pix21 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[x+1]));
|
||||
|
||||
ss = _mm256_set1_ps(offset);
|
||||
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix00, kernel00));
|
||||
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix01, kernel01));
|
||||
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix10, kernel10));
|
||||
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix11, kernel11));
|
||||
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix20, kernel20));
|
||||
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix21, kernel21));
|
||||
|
||||
ssi = _mm_cvtps_epi32(_mm_add_ps(
|
||||
_mm256_extractf128_ps(ss, 0),
|
||||
_mm256_extractf128_ps(ss, 1)
|
||||
));
|
||||
ssi = _mm_packs_epi32(ssi, ssi);
|
||||
ssi = _mm_packus_epi16(ssi, ssi);
|
||||
out[x] = _mm_cvtsi128_si32(ssi);
|
||||
|
||||
pix00 = _mm256_permute2f128_ps(pix00, pix01, 0x21);
|
||||
pix10 = _mm256_permute2f128_ps(pix10, pix11, 0x21);
|
||||
pix20 = _mm256_permute2f128_ps(pix20, pix21, 0x21);
|
||||
}
|
||||
out[x] = in0[x];
|
||||
#else
|
||||
__m128 pix00, pix10, pix20;
|
||||
__m128 pix01, pix11, pix21;
|
||||
__m128 pix02, pix12, pix22;
|
||||
|
@ -297,6 +348,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
|
|||
out[x] = _mm_cvtsi128_si32(ssi0);
|
||||
}
|
||||
out[x] = in0[x];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
memcpy(imOut->image[y], im->image[y], im->linesize);
|
||||
|
|
Loading…
Reference in New Issue
Block a user