SIMD ColorLUT. finally fix alpha copy

This commit is contained in:
Alexander 2018-03-30 12:43:16 +03:00 committed by Alexander Karpinsky
parent c31c78819f
commit 6e22415933

View File

@ -100,22 +100,23 @@ ImagingColorLUT3D_linear(
ImagingSectionEnter(&cookie); ImagingSectionEnter(&cookie);
for (y = 0; y < imOut->ysize; y++) { for (y = 0; y < imOut->ysize; y++) {
UINT8* rowIn = (UINT8 *)imIn->image[y]; UINT32* rowIn = (UINT32 *)imIn->image[y];
UINT32* rowOut = (UINT32 *)imOut->image[y]; UINT32* rowOut = (UINT32 *)imOut->image[y];
UINT8* rowIn8 = (UINT8 *)imIn->image[y];
UINT8* rowOut8 = (UINT8 *)imOut->image[y];
int x = 0; int x = 0;
#if defined(__AVX2__) #if defined(__AVX2__)
{ {
__m256i index = _mm256_mullo_epi32(scale256, __m256i index = _mm256_mullo_epi32(scale256, mm256_cvtepu8_epi32(&rowIn[x]));
_mm256_cvtepu8_epi32(*(__m128i *) &rowIn[x*4]));
__m256i idxs = _mm256_hadd_epi32(_mm256_hadd_epi32( __m256i idxs = _mm256_hadd_epi32(_mm256_hadd_epi32(
_mm256_madd_epi16(index_mul256, _mm256_srli_epi32(index, SCALE_BITS)), _mm256_madd_epi16(index_mul256, _mm256_srli_epi32(index, SCALE_BITS)),
_mm256_setzero_si256()), _mm256_setzero_si256()); _mm256_setzero_si256()), _mm256_setzero_si256());
int idx1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(idxs)); int idx1 = _mm_cvtsi128_si32(_mm256_castsi256_si128(idxs));
int idx2 = _mm256_extract_epi32(idxs, 4); int idx2 = _mm256_extract_epi32(idxs, 4);
for (; x < imOut->xsize - 1; x += 2) { for (; x < imOut->xsize - 1; x += 2) {
__m256i next_index = _mm256_mullo_epi32(scale256, __m256i next_index = _mm256_mullo_epi32(scale256, mm256_cvtepu8_epi32(&rowIn[x + 2]));
_mm256_cvtepu8_epi32(*(__m128i *) &rowIn[x*4 + 8]));
__m256i next_idxs = _mm256_hadd_epi32(_mm256_hadd_epi32( __m256i next_idxs = _mm256_hadd_epi32(_mm256_hadd_epi32(
_mm256_madd_epi16(index_mul256, _mm256_srli_epi32(next_index, SCALE_BITS)), _mm256_madd_epi16(index_mul256, _mm256_srli_epi32(next_index, SCALE_BITS)),
_mm256_setzero_si256()), _mm256_setzero_si256()); _mm256_setzero_si256()), _mm256_setzero_si256());
@ -189,6 +190,8 @@ ImagingColorLUT3D_linear(
result = _mm256_packus_epi16(result, result); result = _mm256_packus_epi16(result, result);
rowOut[x + 0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(result)); rowOut[x + 0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(result));
rowOut[x + 1] = _mm256_extract_epi32(result, 4); rowOut[x + 1] = _mm256_extract_epi32(result, 4);
rowOut8[x*4 + 3] = rowIn8[x*4 + 3];
rowOut8[x*4 + 7] = rowIn8[x*4 + 7];
} }
if (table_channels == 4) { if (table_channels == 4) {
@ -251,15 +254,14 @@ ImagingColorLUT3D_linear(
} }
#endif #endif
__m128i index = _mm_mullo_epi32(scale, __m128i index = _mm_mullo_epi32(scale, mm_cvtepu8_epi32(&rowIn[x]));
_mm_cvtepu8_epi32(*(__m128i *) &rowIn[x*4]));
int idx = _mm_cvtsi128_si32( int idx = _mm_cvtsi128_si32(
_mm_hadd_epi32(_mm_hadd_epi32( _mm_hadd_epi32(_mm_hadd_epi32(
_mm_madd_epi16(index_mul, _mm_srli_epi32(index, SCALE_BITS)), _mm_madd_epi16(index_mul, _mm_srli_epi32(index, SCALE_BITS)),
_mm_setzero_si128()), _mm_setzero_si128())); _mm_setzero_si128()), _mm_setzero_si128()));
for (; x < imOut->xsize; x++) { for (; x < imOut->xsize; x++) {
__m128i next_index = _mm_mullo_epi32(scale, __m128i next_index = _mm_mullo_epi32(scale, mm_cvtepu8_epi32(&rowIn[x + 1]));
_mm_cvtepu8_epi32(*(__m128i *) &rowIn[x*4 + 4]));
int next_idx = _mm_cvtsi128_si32( int next_idx = _mm_cvtsi128_si32(
_mm_hadd_epi32(_mm_hadd_epi32( _mm_hadd_epi32(_mm_hadd_epi32(
_mm_madd_epi16(index_mul, _mm_srli_epi32(next_index, SCALE_BITS)), _mm_madd_epi16(index_mul, _mm_srli_epi32(next_index, SCALE_BITS)),
@ -318,6 +320,7 @@ ImagingColorLUT3D_linear(
result = _mm_packs_epi32(result, result); result = _mm_packs_epi32(result, result);
rowOut[x] = _mm_cvtsi128_si32(_mm_packus_epi16(result, result)); rowOut[x] = _mm_cvtsi128_si32(_mm_packus_epi16(result, result));
rowOut8[x*4 + 3] = rowIn8[x*4 + 3];
} }
if (table_channels == 4) { if (table_channels == 4) {