SIMD. Fix out-of-bounds memory read on some compilers: loading a single 4-byte pixel through `*(__m128i *)` issued a 16-byte read that could run past the end of the line buffer; replace these with exact-width loads via mm_cvtepu8_epi32 / mm256_cvtepu8_epi32 helpers.

This commit is contained in:
Alexander 2020-01-07 02:35:42 +03:00 committed by Alexander Karpinsky
parent f9c5589cac
commit 33b11280b0
8 changed files with 56 additions and 43 deletions

View File

@ -22,8 +22,8 @@ ImagingLineBoxBlur32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int radius, int
#define MOVE_ACC(acc, subtract, add) \ #define MOVE_ACC(acc, subtract, add) \
{ \ { \
__m128i tmp1 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[add])); \ __m128i tmp1 = mm_cvtepu8_epi32(&lineIn[add]); \
__m128i tmp2 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[subtract])); \ __m128i tmp2 = mm_cvtepu8_epi32(&lineIn[subtract]); \
acc = _mm_sub_epi32(_mm_add_epi32(acc, tmp1), tmp2); \ acc = _mm_sub_epi32(_mm_add_epi32(acc, tmp1), tmp2); \
} }
@ -32,8 +32,8 @@ ImagingLineBoxBlur32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int radius, int
#define ADD_FAR(bulk, acc, left, right) \ #define ADD_FAR(bulk, acc, left, right) \
{ \ { \
__m128i tmp3 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[left])); \ __m128i tmp3 = mm_cvtepu8_epi32(&lineIn[left]); \
__m128i tmp4 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[right])); \ __m128i tmp4 = mm_cvtepu8_epi32(&lineIn[right]); \
__m128i tmp5 = _mm_mullo_epi32(_mm_add_epi32(tmp3, tmp4), fws); \ __m128i tmp5 = _mm_mullo_epi32(_mm_add_epi32(tmp3, tmp4), fws); \
__m128i tmp6 = _mm_mullo_epi32(acc, wws); \ __m128i tmp6 = _mm_mullo_epi32(acc, wws); \
bulk = _mm_add_epi32(tmp5, tmp6); \ bulk = _mm_add_epi32(tmp5, tmp6); \
@ -52,19 +52,19 @@ ImagingLineBoxBlur32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int radius, int
then from "0" to "radius-1". */ then from "0" to "radius-1". */
// acc[0] = lineIn[0][0] * (radius + 1); // acc[0] = lineIn[0][0] * (radius + 1);
acc = _mm_mullo_epi32( acc = _mm_mullo_epi32(
_mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[0])), mm_cvtepu8_epi32(&lineIn[0]),
_mm_set1_epi32(radius + 1) _mm_set1_epi32(radius + 1)
); );
/* As radius can be bigger than xsize, iterate to edgeA -1. */ /* As radius can be bigger than xsize, iterate to edgeA -1. */
for (x = 0; x < edgeA - 1; x++) { for (x = 0; x < edgeA - 1; x++) {
// acc[0] += lineIn[x][0]; // acc[0] += lineIn[x][0];
acc = _mm_add_epi32(acc, _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[x]))); acc = _mm_add_epi32(acc, mm_cvtepu8_epi32(&lineIn[x]));
} }
/* Then multiply remainder to last x. */ /* Then multiply remainder to last x. */
// acc[0] += lineIn[lastx][0] * (radius - edgeA + 1); // acc[0] += lineIn[lastx][0] * (radius - edgeA + 1);
acc = _mm_add_epi32(acc, _mm_mullo_epi32( acc = _mm_add_epi32(acc, _mm_mullo_epi32(
_mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[x])), mm_cvtepu8_epi32(&lineIn[x]),
_mm_set1_epi32(radius - edgeA + 1) _mm_set1_epi32(radius - edgeA + 1)
)); ));
@ -132,8 +132,8 @@ ImagingLineBoxBlurZero32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int edgeA,
#define ADD_FAR(bulk, acc, left, right) \ #define ADD_FAR(bulk, acc, left, right) \
{ \ { \
__m128i tmp3 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[left])); \ __m128i tmp3 = mm_cvtepu8_epi32(&lineIn[left]); \
__m128i tmp4 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[right])); \ __m128i tmp4 = mm_cvtepu8_epi32(&lineIn[right]); \
__m128i tmp5 = _mm_mullo_epi32(_mm_add_epi32(tmp3, tmp4), fws); \ __m128i tmp5 = _mm_mullo_epi32(_mm_add_epi32(tmp3, tmp4), fws); \
__m128i tmp6 = _mm_mullo_epi32(acc, wws); \ __m128i tmp6 = _mm_mullo_epi32(acc, wws); \
bulk = _mm_add_epi32(tmp5, tmp6); \ bulk = _mm_add_epi32(tmp5, tmp6); \
@ -151,21 +151,21 @@ ImagingLineBoxBlurZero32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int edgeA,
/* Subtract pixel from left ("0"). /* Subtract pixel from left ("0").
Add pixels from radius. */ Add pixels from radius. */
for (x = 0; x < edgeA; x++) { for (x = 0; x < edgeA; x++) {
acc = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[0])); acc = mm_cvtepu8_epi32(&lineIn[0]);
ADD_FAR(bulk, acc, 0, x + 1); ADD_FAR(bulk, acc, 0, x + 1);
SAVE(x, bulk); SAVE(x, bulk);
} }
/* Subtract previous pixel from "-radius". /* Subtract previous pixel from "-radius".
Add pixels from radius. */ Add pixels from radius. */
for (x = edgeA; x < edgeB; x++) { for (x = edgeA; x < edgeB; x++) {
acc = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[x])); acc = mm_cvtepu8_epi32(&lineIn[x]);
ADD_FAR(bulk, acc, x - 1, x + 1); ADD_FAR(bulk, acc, x - 1, x + 1);
SAVE(x, bulk); SAVE(x, bulk);
} }
/* Subtract previous pixel from "-radius". /* Subtract previous pixel from "-radius".
Add last pixel. */ Add last pixel. */
for (x = edgeB; x <= lastx; x++) { for (x = edgeB; x <= lastx; x++) {
acc = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[lastx])); acc = mm_cvtepu8_epi32(&lineIn[lastx]);
ADD_FAR(bulk, acc, x - 1, lastx); ADD_FAR(bulk, acc, x - 1, lastx);
SAVE(x, bulk); SAVE(x, bulk);
} }

View File

@ -3,9 +3,9 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel,
float offset) float offset)
{ {
#define MM_KERNEL1x3_LOAD(row, x) \ #define MM_KERNEL1x3_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x]));
#define MM_KERNEL1x3_SUM(row, kindex) \ #define MM_KERNEL1x3_SUM(row, kindex) \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \
@ -25,9 +25,9 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel,
#if defined(__AVX2__) #if defined(__AVX2__)
#define MM256_LOAD(row, x) \ #define MM256_LOAD(row, x) \
pix0##row = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[x])); \ pix0##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in1[x])); \
pix1##row = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[x])); \ pix1##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in0[x])); \
pix2##row = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[x])); pix2##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in_1[x]));
#define MM256_SUM(pixrow, kernelrow) \ #define MM256_SUM(pixrow, kernelrow) \
ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \ ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \

View File

@ -15,9 +15,9 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel)); ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel));
#define MM_KERNEL1x3_LOAD(row, x) \ #define MM_KERNEL1x3_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x]));
int x, y; int x, y;
__m128 kernel00 = _mm_set_ps(0, kernel[2], kernel[1], kernel[0]); __m128 kernel00 = _mm_set_ps(0, kernel[2], kernel[1], kernel[0]);
@ -49,7 +49,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
MM_KERNEL1x3_SUM1(ss0, 0, 0); MM_KERNEL1x3_SUM1(ss0, 0, 0);
MM_KERNEL1x3_SUM1(ss1, 0, 1); MM_KERNEL1x3_SUM1(ss1, 0, 1);
ss0 = _mm_hadd_ps(ss0, ss1); ss0 = _mm_hadd_ps(ss0, ss1);
pix30 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x-1])); pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x-1]));
MM_KERNEL1x3_SUM1_2(ss3, 0, 0); MM_KERNEL1x3_SUM1_2(ss3, 0, 0);
MM_KERNEL1x3_SUM1_2(ss4, 0, 1); MM_KERNEL1x3_SUM1_2(ss4, 0, 1);
ss3 = _mm_hadd_ps(ss3, ss4); ss3 = _mm_hadd_ps(ss3, ss4);
@ -58,7 +58,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel,
MM_KERNEL1x3_SUM1(ss1, 0, 0); MM_KERNEL1x3_SUM1(ss1, 0, 0);
MM_KERNEL1x3_SUM1(ss2, 0, 1); MM_KERNEL1x3_SUM1(ss2, 0, 1);
ss1 = _mm_hadd_ps(ss1, ss2); ss1 = _mm_hadd_ps(ss1, ss2);
pix30 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x+1])); pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x+1]));
MM_KERNEL1x3_SUM1_2(ss4, 0, 0); MM_KERNEL1x3_SUM1_2(ss4, 0, 0);
MM_KERNEL1x3_SUM1_2(ss5, 0, 1); MM_KERNEL1x3_SUM1_2(ss5, 0, 1);
ss4 = _mm_hadd_ps(ss4, ss5); ss4 = _mm_hadd_ps(ss4, ss5);

View File

@ -3,11 +3,11 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel,
float offset) float offset)
{ {
#define MM_KERNEL1x5_LOAD(row, x) \ #define MM_KERNEL1x5_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x])); \ pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \
pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix3##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); \ pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \
pix4##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_2[x])); pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x]));
#define MM_KERNEL1x5_SUM(row, kindex) \ #define MM_KERNEL1x5_SUM(row, kindex) \
ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \

View File

@ -3,11 +3,11 @@ ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel,
float offset) float offset)
{ {
#define MM_KERNEL1x5_LOAD(row, x) \ #define MM_KERNEL1x5_LOAD(row, x) \
pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x])); \ pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \
pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \
pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \
pix3##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); \ pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \
pix4##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_2[x])); pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x]));
#define MM_KERNEL1x5_SUM(ss, row, krow) \ #define MM_KERNEL1x5_SUM(ss, row, krow) \
ss = _mm_setzero_ps(); \ ss = _mm_setzero_ps(); \

View File

@ -40,3 +40,16 @@ static float __attribute__((always_inline)) inline _i2f(int v) {
#else #else
static float inline _i2f(int v) { return (float)v; } static float inline _i2f(int v) { return (float)v; }
#endif #endif
__m128i inline
mm_cvtepu8_epi32(void *ptr) {
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(INT32 *) ptr));
}
#if defined(__AVX2__)
/* Zero-extends 8 packed 8-bit channels (two pixels) at `ptr` into
   8 packed 32-bit lanes of an __m256i.
   `_mm_loadl_epi64` performs the 8-byte load directly: it reads exactly
   8 bytes, has no alignment requirement, and — unlike
   `_mm_cvtsi64_si128(*(int64_t *) ptr)` — is also available on 32-bit
   x86 targets and avoids dereferencing through an int64_t lvalue.
   `static` avoids C99 external-inline link errors when this header is
   included from several translation units. */
static inline __m256i
mm256_cvtepu8_epi32(const void *ptr) {
    return _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *) ptr));
}
#endif

View File

@ -82,13 +82,13 @@ ImagingResampleHorizontalConvolution8u4x(
// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0 // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
pix = _mm256_inserti128_si256(_mm256_castsi128_si256( pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])), mm_cvtepu8_epi32(&lineIn0[x + xmin])),
_mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1); mm_cvtepu8_epi32(&lineIn1[x + xmin]), 1);
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
pix = _mm256_inserti128_si256(_mm256_castsi128_si256( pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
_mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])), mm_cvtepu8_epi32(&lineIn2[x + xmin])),
_mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1); mm_cvtepu8_epi32(&lineIn3[x + xmin]), 1);
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
} }
@ -182,16 +182,16 @@ ImagingResampleHorizontalConvolution8u4x(
// [16] xx k0 xx k0 xx k0 xx k0 // [16] xx k0 xx k0 xx k0 xx k0
mmk = _mm_set1_epi32(k[x]); mmk = _mm_set1_epi32(k[x]);
// [16] xx a0 xx b0 xx g0 xx r0 // [16] xx a0 xx b0 xx g0 xx r0
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]); pix = mm_cvtepu8_epi32(&lineIn0[x + xmin]);
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]); pix = mm_cvtepu8_epi32(&lineIn1[x + xmin]);
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]); pix = mm_cvtepu8_epi32(&lineIn2[x + xmin]);
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk)); sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]); pix = mm_cvtepu8_epi32(&lineIn3[x + xmin]);
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk)); sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
} }
@ -354,7 +354,7 @@ ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn,
} }
for (; x < xmax; x ++) { for (; x < xmax; x ++) {
__m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn[x + xmin]); __m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]);
__m128i mmk = _mm_set1_epi32(k[x]); __m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
} }

View File

@ -244,7 +244,7 @@ ImagingResampleVerticalConvolution8u(UINT32 *lineOut, Imaging imIn,
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
} }
for (; x < xmax; x++) { for (; x < xmax; x++) {
__m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &imIn->image32[x + xmin][xx]); __m128i pix = mm_cvtepu8_epi32(&imIn->image32[x + xmin][xx]);
__m128i mmk = _mm_set1_epi32(k[x]); __m128i mmk = _mm_set1_epi32(k[x]);
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
} }