SIMD Filter. First AVX try

This commit is contained in:
Alexander 2017-09-04 13:05:59 +03:00 committed by Alexander Karpinsky
parent f2e021aa94
commit 80dfbaf22d

View File

@@ -244,6 +244,57 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
UINT32* in0 = (UINT32*) im->image[y]; UINT32* in0 = (UINT32*) im->image[y];
UINT32* in1 = (UINT32*) im->image[y+1]; UINT32* in1 = (UINT32*) im->image[y+1];
UINT32* out = (UINT32*) imOut->image[y]; UINT32* out = (UINT32*) imOut->image[y];
#if defined(__AVX2__)
__m256 kernel00 = _mm256_insertf128_ps(
_mm256_set1_ps(kernel[0+0]),
_mm_set1_ps(kernel[0+1]), 1);
__m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2]));
__m256 kernel10 = _mm256_insertf128_ps(
_mm256_set1_ps(kernel[3+0]),
_mm_set1_ps(kernel[3+1]), 1);
__m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2]));
__m256 kernel20 = _mm256_insertf128_ps(
_mm256_set1_ps(kernel[6+0]),
_mm_set1_ps(kernel[6+1]), 1);
__m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2]));
__m256 pix00, pix10, pix20;
__m256 pix01, pix11, pix21;
out[0] = in0[0];
x = 1;
pix00 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[0]));
pix10 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[0]));
pix20 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[0]));
for (; x < im->xsize-1; x += 1) {
__m256 ss;
__m128i ssi;
pix01 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[x+1]));
pix11 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[x+1]));
pix21 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[x+1]));
ss = _mm256_set1_ps(offset);
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix00, kernel00));
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix01, kernel01));
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix10, kernel10));
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix11, kernel11));
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix20, kernel20));
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix21, kernel21));
ssi = _mm_cvtps_epi32(_mm_add_ps(
_mm256_extractf128_ps(ss, 0),
_mm256_extractf128_ps(ss, 1)
));
ssi = _mm_packs_epi32(ssi, ssi);
ssi = _mm_packus_epi16(ssi, ssi);
out[x] = _mm_cvtsi128_si32(ssi);
pix00 = _mm256_permute2f128_ps(pix00, pix01, 0x21);
pix10 = _mm256_permute2f128_ps(pix10, pix11, 0x21);
pix20 = _mm256_permute2f128_ps(pix20, pix21, 0x21);
}
out[x] = in0[x];
#else
__m128 pix00, pix10, pix20; __m128 pix00, pix10, pix20;
__m128 pix01, pix11, pix21; __m128 pix01, pix11, pix21;
__m128 pix02, pix12, pix22; __m128 pix02, pix12, pix22;
@@ -297,6 +348,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
out[x] = _mm_cvtsi128_si32(ssi0); out[x] = _mm_cvtsi128_si32(ssi0);
} }
out[x] = in0[x]; out[x] = in0[x];
#endif
} }
} }
memcpy(imOut->image[y], im->image[y], im->linesize); memcpy(imOut->image[y], im->image[y], im->linesize);