SIMD ColorLUT. First try

This commit is contained in:
Alexander 2018-03-27 03:20:19 +03:00 committed by Alexander Karpinsky
parent 23451b8423
commit 16f69404f1
2 changed files with 82 additions and 73 deletions

View File

@ -761,7 +761,7 @@ _prepare_lut_table(PyObject *table, Py_ssize_t table_size) {
} }
/* malloc check ok, max is 2 * 4 * 65**3 = 2197000 */ /* malloc check ok, max is 2 * 4 * 65**3 = 2197000 */
prepared = (INT16 *)malloc(sizeof(INT16) * table_size); prepared = (INT16 *)malloc(sizeof(INT16) * table_size + 2);
if (!prepared) { if (!prepared) {
if (free_table_data) { if (free_table_data) {
free(table_data); free(table_data);

View File

@ -13,7 +13,8 @@
#define SCALE_BITS (32 - 8 - 6) #define SCALE_BITS (32 - 8 - 6)
#define SCALE_MASK ((1 << SCALE_BITS) - 1) #define SCALE_MASK ((1 << SCALE_BITS) - 1)
#define SHIFT_BITS (16 - 1) #define SHIFT_BITS (16 - 2)
#define SHIFT_ROUNDING (1<<(SHIFT_BITS-1))
static inline UINT8 static inline UINT8
clip8(int in) { clip8(int in) {
@ -74,10 +75,17 @@ ImagingColorLUT3D_linear(
+1 cells will be outside of the table. +1 cells will be outside of the table.
With this compensation we never hit the upper cells With this compensation we never hit the upper cells
but this also doesn't introduce any noticeable difference. */ but this also doesn't introduce any noticeable difference. */
UINT32 scale1D = (size1D - 1) / 255.0 * (1 << SCALE_BITS);
UINT32 scale2D = (size2D - 1) / 255.0 * (1 << SCALE_BITS);
UINT32 scale3D = (size3D - 1) / 255.0 * (1 << SCALE_BITS);
int size1D_2D = size1D * size2D; int size1D_2D = size1D * size2D;
__m128i scale = _mm_set_epi32(0,
(size3D - 1) / 255.0 * (1<<SCALE_BITS),
(size2D - 1) / 255.0 * (1<<SCALE_BITS),
(size1D - 1) / 255.0 * (1<<SCALE_BITS));
__m128i scale_mask = _mm_set1_epi32(SCALE_MASK);
__m128i index_mul = _mm_set_epi32(0, size1D_2D, size1D, 1);
__m128i shift_rounding = _mm_set1_epi32(SHIFT_ROUNDING);
__m128i shuffle_source = _mm_set_epi8(-1,-1, -1,-1, 11,10, 5,4, 9,8, 3,2, 7,6, 1,0);
__m128i left_mask = _mm_set1_epi32(0x0000ffff);
__m128i right_mask = _mm_set1_epi32(0xffff0000);
int x, y; int x, y;
ImagingSectionCookie cookie; ImagingSectionCookie cookie;
@ -101,84 +109,85 @@ ImagingColorLUT3D_linear(
UINT8 *rowIn = (UINT8 *)imIn->image[y]; UINT8 *rowIn = (UINT8 *)imIn->image[y];
char *rowOut = (char *)imOut->image[y]; char *rowOut = (char *)imOut->image[y];
for (x = 0; x < imOut->xsize; x++) { for (x = 0; x < imOut->xsize; x++) {
UINT32 index1D = rowIn[x * 4 + 0] * scale1D; __m128i index = _mm_mullo_epi32(scale,
UINT32 index2D = rowIn[x * 4 + 1] * scale2D; _mm_cvtepu8_epi32(*(__m128i *) &rowIn[x*4]));
UINT32 index3D = rowIn[x * 4 + 2] * scale3D; __m128i shift = _mm_srli_epi32(
INT16 shift1D = (SCALE_MASK & index1D) >> (SCALE_BITS - SHIFT_BITS); _mm_and_si128(scale_mask, index), (SCALE_BITS - SHIFT_BITS));
INT16 shift2D = (SCALE_MASK & index2D) >> (SCALE_BITS - SHIFT_BITS); int idx = table_channels * _mm_extract_epi32(
INT16 shift3D = (SCALE_MASK & index3D) >> (SCALE_BITS - SHIFT_BITS); _mm_hadd_epi32(_mm_hadd_epi32(
int idx = table_channels * table_index3D( _mm_madd_epi16(index_mul, _mm_srli_epi32(index, SCALE_BITS)),
index1D >> SCALE_BITS, _mm_setzero_si128()), _mm_setzero_si128()), 0);
index2D >> SCALE_BITS, __m128i shift1D, shift2D, shift3D;
index3D >> SCALE_BITS, __m128i source, left, right, result;
size1D, __m128i leftleft, leftright, rightleft, rightright;
size1D_2D);
INT16 result[4], left[4], right[4]; shift = _mm_or_si128(
INT16 leftleft[4], leftright[4], rightleft[4], rightright[4]; _mm_sub_epi32(_mm_set1_epi32(1<<SHIFT_BITS), shift),
_mm_slli_epi32(shift, 16));
shift1D = _mm_shuffle_epi32(shift, 0x00);
shift2D = _mm_shuffle_epi32(shift, 0x55);
shift3D = _mm_shuffle_epi32(shift, 0xaa);
if (table_channels == 3) { if (table_channels == 3) {
UINT32 v; source = _mm_shuffle_epi8(
interpolate3(leftleft, &table[idx + 0], &table[idx + 3], shift1D); _mm_loadu_si128((__m128i *) &table[idx + 0]), shuffle_source);
interpolate3( leftleft = _mm_and_si128(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(
leftright, source, shift1D), shift_rounding), SHIFT_BITS), left_mask);
&table[idx + size1D * 3],
&table[idx + size1D * 3 + 3],
shift1D);
interpolate3(left, leftleft, leftright, shift2D);
interpolate3( source = _mm_shuffle_epi8(
rightleft, _mm_loadu_si128((__m128i *) &table[idx + size1D*3]), shuffle_source);
&table[idx + size1D_2D * 3], leftright = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(_mm_madd_epi16(
&table[idx + size1D_2D * 3 + 3], source, shift1D), shift_rounding), 16 - SHIFT_BITS), right_mask);
shift1D);
interpolate3(
rightright,
&table[idx + size1D_2D * 3 + size1D * 3],
&table[idx + size1D_2D * 3 + size1D * 3 + 3],
shift1D);
interpolate3(right, rightleft, rightright, shift2D);
interpolate3(result, left, right, shift3D); source = _mm_shuffle_epi8(
_mm_loadu_si128((__m128i *) &table[idx + size1D_2D*3]), shuffle_source);
rightleft = _mm_and_si128(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(
source, shift1D), shift_rounding), SHIFT_BITS), left_mask);
source = _mm_shuffle_epi8(
_mm_loadu_si128((__m128i *) &table[idx + size1D_2D*3 + size1D*3]), shuffle_source);
rightright = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(_mm_madd_epi16(
source, shift1D), shift_rounding), 16 - SHIFT_BITS), right_mask);
v = MAKE_UINT32( left = _mm_and_si128(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(
clip8(result[0]), _mm_or_si128(leftleft, leftright), shift2D),
clip8(result[1]), shift_rounding), SHIFT_BITS), left_mask);
clip8(result[2]),
rowIn[x * 4 + 3]); right = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(_mm_madd_epi16(
memcpy(rowOut + x * sizeof(v), &v, sizeof(v)); _mm_or_si128(rightleft, rightright), shift2D),
shift_rounding), 16 - SHIFT_BITS), right_mask);
result = _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(
_mm_or_si128(left, right), shift3D),
shift_rounding), SHIFT_BITS);
result = _mm_srai_epi32(
_mm_add_epi32(_mm_set1_epi32(PRECISION_ROUNDING), result),
PRECISION_BITS);
result = _mm_packs_epi32(result, result);
rowOut[x] = _mm_cvtsi128_si32(_mm_packus_epi16(result, result));
} }
if (table_channels == 4) { // if (table_channels == 4) {
UINT32 v; // interpolate4(leftleft, &table[idx + 0], &table[idx + 4], shift1D);
interpolate4(leftleft, &table[idx + 0], &table[idx + 4], shift1D); // interpolate4(leftright, &table[idx + size1D*4],
interpolate4( // &table[idx + size1D*4 + 4], shift1D);
leftright, // interpolate4(left, leftleft, leftright, shift2D);
&table[idx + size1D * 4],
&table[idx + size1D * 4 + 4],
shift1D);
interpolate4(left, leftleft, leftright, shift2D);
interpolate4( // interpolate4(rightleft, &table[idx + size1D_2D*4],
rightleft, // &table[idx + size1D_2D*4 + 4], shift1D);
&table[idx + size1D_2D * 4], // interpolate4(rightright, &table[idx + size1D_2D*4 + size1D*4],
&table[idx + size1D_2D * 4 + 4], // &table[idx + size1D_2D*4 + size1D*4 + 4], shift1D);
shift1D); // interpolate4(right, rightleft, rightright, shift2D);
interpolate4(
rightright,
&table[idx + size1D_2D * 4 + size1D * 4],
&table[idx + size1D_2D * 4 + size1D * 4 + 4],
shift1D);
interpolate4(right, rightleft, rightright, shift2D);
interpolate4(result, left, right, shift3D); // interpolate4(result, left, right, shift3D);
v = MAKE_UINT32( // rowOut[x] = MAKE_UINT32(
clip8(result[0]), // clip8(result[0]), clip8(result[1]),
clip8(result[1]), // clip8(result[2]), clip8(result[3]));
clip8(result[2]), // }
clip8(result[3]));
memcpy(rowOut + x * sizeof(v), &v, sizeof(v));
}
} }
} }
ImagingSectionLeave(&cookie); ImagingSectionLeave(&cookie);