From 33b11280b06cd7e87077dccaf5aa31e8f2d311a8 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 7 Jan 2020 02:35:42 +0300 Subject: [PATCH] SIMD. Fix memory read error on some compilers --- src/libImaging/BoxBlur.c | 24 ++++++++++----------- src/libImaging/FilterSIMD_3x3f_4u8.c | 12 +++++------ src/libImaging/FilterSIMD_3x3f_u8.c | 10 ++++----- src/libImaging/FilterSIMD_5x5f_4u8.c | 10 ++++----- src/libImaging/FilterSIMD_5x5f_u8.c | 10 ++++----- src/libImaging/ImagingUtils.h | 13 +++++++++++ src/libImaging/ResampleSIMDHorizontalConv.c | 18 ++++++++-------- src/libImaging/ResampleSIMDVerticalConv.c | 2 +- 8 files changed, 56 insertions(+), 43 deletions(-) diff --git a/src/libImaging/BoxBlur.c b/src/libImaging/BoxBlur.c index 928c86a9b..7997ccd90 100644 --- a/src/libImaging/BoxBlur.c +++ b/src/libImaging/BoxBlur.c @@ -22,8 +22,8 @@ ImagingLineBoxBlur32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int radius, int #define MOVE_ACC(acc, subtract, add) \ { \ - __m128i tmp1 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[add])); \ - __m128i tmp2 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[subtract])); \ + __m128i tmp1 = mm_cvtepu8_epi32(&lineIn[add]); \ + __m128i tmp2 = mm_cvtepu8_epi32(&lineIn[subtract]); \ acc = _mm_sub_epi32(_mm_add_epi32(acc, tmp1), tmp2); \ } @@ -32,8 +32,8 @@ ImagingLineBoxBlur32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int radius, int #define ADD_FAR(bulk, acc, left, right) \ { \ - __m128i tmp3 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[left])); \ - __m128i tmp4 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[right])); \ + __m128i tmp3 = mm_cvtepu8_epi32(&lineIn[left]); \ + __m128i tmp4 = mm_cvtepu8_epi32(&lineIn[right]); \ __m128i tmp5 = _mm_mullo_epi32(_mm_add_epi32(tmp3, tmp4), fws); \ __m128i tmp6 = _mm_mullo_epi32(acc, wws); \ bulk = _mm_add_epi32(tmp5, tmp6); \ @@ -52,19 +52,19 @@ ImagingLineBoxBlur32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int radius, int then from "0" to "radius-1". */ // acc[0] = lineIn[0][0] * (radius + 1); acc = _mm_mullo_epi32( - _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[0])), + mm_cvtepu8_epi32(&lineIn[0]), _mm_set1_epi32(radius + 1) ); /* As radius can be bigger than xsize, iterate to edgeA -1. */ for (x = 0; x < edgeA - 1; x++) { // acc[0] += lineIn[x][0]; - acc = _mm_add_epi32(acc, _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[x]))); + acc = _mm_add_epi32(acc, mm_cvtepu8_epi32(&lineIn[x])); } /* Then multiply remainder to last x. */ // acc[0] += lineIn[lastx][0] * (radius - edgeA + 1); acc = _mm_add_epi32(acc, _mm_mullo_epi32( - _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[x])), + mm_cvtepu8_epi32(&lineIn[x]), _mm_set1_epi32(radius - edgeA + 1) )); @@ -132,8 +132,8 @@ ImagingLineBoxBlurZero32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int edgeA, #define ADD_FAR(bulk, acc, left, right) \ { \ - __m128i tmp3 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[left])); \ - __m128i tmp4 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[right])); \ + __m128i tmp3 = mm_cvtepu8_epi32(&lineIn[left]); \ + __m128i tmp4 = mm_cvtepu8_epi32(&lineIn[right]); \ __m128i tmp5 = _mm_mullo_epi32(_mm_add_epi32(tmp3, tmp4), fws); \ __m128i tmp6 = _mm_mullo_epi32(acc, wws); \ bulk = _mm_add_epi32(tmp5, tmp6); \ @@ -151,21 +151,21 @@ ImagingLineBoxBlurZero32(UINT32 *lineOut, UINT32 *lineIn, int lastx, int edgeA, /* Subtract pixel from left ("0"). Add pixels from radius. */ for (x = 0; x < edgeA; x++) { - acc = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[0])); + acc = mm_cvtepu8_epi32(&lineIn[0]); ADD_FAR(bulk, acc, 0, x + 1); SAVE(x, bulk); } /* Subtract previous pixel from "-radius". Add pixels from radius. */ for (x = edgeA; x < edgeB; x++) { - acc = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[x])); + acc = mm_cvtepu8_epi32(&lineIn[x]); ADD_FAR(bulk, acc, x - 1, x + 1); SAVE(x, bulk); } /* Subtract previous pixel from "-radius". Add last pixel. */ for (x = edgeB; x <= lastx; x++) { - acc = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(lineIn[lastx])); + acc = mm_cvtepu8_epi32(&lineIn[lastx]); ADD_FAR(bulk, acc, x - 1, lastx); SAVE(x, bulk); } diff --git a/src/libImaging/FilterSIMD_3x3f_4u8.c b/src/libImaging/FilterSIMD_3x3f_4u8.c index 8d665f16a..02272f688 100644 --- a/src/libImaging/FilterSIMD_3x3f_4u8.c +++ b/src/libImaging/FilterSIMD_3x3f_4u8.c @@ -3,9 +3,9 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, float offset) { #define MM_KERNEL1x3_LOAD(row, x) \ - pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ - pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ - pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); #define MM_KERNEL1x3_SUM(row, kindex) \ ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ @@ -25,9 +25,9 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, #if defined(__AVX2__) #define MM256_LOAD(row, x) \ - pix0##row = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in1[x])); \ - pix1##row = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in0[x])); \ - pix2##row = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(*(__m128i*) &in_1[x])); + pix0##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in_1[x])); #define MM256_SUM(pixrow, kernelrow) \ ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \ diff --git a/src/libImaging/FilterSIMD_3x3f_u8.c b/src/libImaging/FilterSIMD_3x3f_u8.c index 64f365008..f5eefc970 100644 --- a/src/libImaging/FilterSIMD_3x3f_u8.c +++ b/src/libImaging/FilterSIMD_3x3f_u8.c @@ -15,9 +15,9 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel)); #define MM_KERNEL1x3_LOAD(row, x) \ - pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ - pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ - pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); int x, y; __m128 kernel00 = _mm_set_ps(0, kernel[2], kernel[1], kernel[0]); @@ -49,7 +49,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, MM_KERNEL1x3_SUM1(ss0, 0, 0); MM_KERNEL1x3_SUM1(ss1, 0, 1); ss0 = _mm_hadd_ps(ss0, ss1); - pix30 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x-1])); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x-1])); MM_KERNEL1x3_SUM1_2(ss3, 0, 0); MM_KERNEL1x3_SUM1_2(ss4, 0, 1); ss3 = _mm_hadd_ps(ss3, ss4); @@ -58,7 +58,7 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, MM_KERNEL1x3_SUM1(ss1, 0, 0); MM_KERNEL1x3_SUM1(ss2, 0, 1); ss1 = _mm_hadd_ps(ss1, ss2); - pix30 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x+1])); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x+1])); MM_KERNEL1x3_SUM1_2(ss4, 0, 0); MM_KERNEL1x3_SUM1_2(ss5, 0, 1); ss4 = _mm_hadd_ps(ss4, ss5); diff --git a/src/libImaging/FilterSIMD_5x5f_4u8.c b/src/libImaging/FilterSIMD_5x5f_4u8.c index 8cb52df06..097242adb 100644 --- a/src/libImaging/FilterSIMD_5x5f_4u8.c +++ b/src/libImaging/FilterSIMD_5x5f_4u8.c @@ -3,11 +3,11 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, float offset) { #define MM_KERNEL1x5_LOAD(row, x) \ - pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x])); \ - pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ - pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ - pix3##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); \ - pix4##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_2[x])); + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ + pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); #define MM_KERNEL1x5_SUM(row, kindex) \ ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ diff --git a/src/libImaging/FilterSIMD_5x5f_u8.c b/src/libImaging/FilterSIMD_5x5f_u8.c index b913aa275..562cb5d0e 100644 --- a/src/libImaging/FilterSIMD_5x5f_u8.c +++ b/src/libImaging/FilterSIMD_5x5f_u8.c @@ -3,11 +3,11 @@ ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel, float offset) { #define MM_KERNEL1x5_LOAD(row, x) \ - pix0##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in2[x])); \ - pix1##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in1[x])); \ - pix2##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in0[x])); \ - pix3##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_1[x])); \ - pix4##row = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(*(__m128i*) &in_2[x])); + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ + pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); #define MM_KERNEL1x5_SUM(ss, row, krow) \ ss = _mm_setzero_ps(); \ diff --git a/src/libImaging/ImagingUtils.h b/src/libImaging/ImagingUtils.h index 0c0c1eda9..e9d2f894f 100644 --- a/src/libImaging/ImagingUtils.h +++ b/src/libImaging/ImagingUtils.h @@ -40,3 +40,16 @@ static float __attribute__((always_inline)) inline _i2f(int v) { #else static float inline _i2f(int v) { return (float)v; } #endif + + +__m128i inline +mm_cvtepu8_epi32(void *ptr) { + return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(INT32 *) ptr)); +} + +#if defined(__AVX2__) +__m256i inline +mm256_cvtepu8_epi32(void *ptr) { + return _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*(int64_t *) ptr)); +} +#endif diff --git a/src/libImaging/ResampleSIMDHorizontalConv.c b/src/libImaging/ResampleSIMDHorizontalConv.c index 002e53d40..6aef5b09d 100644 --- a/src/libImaging/ResampleSIMDHorizontalConv.c +++ b/src/libImaging/ResampleSIMDHorizontalConv.c @@ -82,13 +82,13 @@ ImagingResampleHorizontalConvolution8u4x( // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0 pix = _mm256_inserti128_si256(_mm256_castsi128_si256( - _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])), - _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1); + mm_cvtepu8_epi32(&lineIn0[x + xmin])), + mm_cvtepu8_epi32(&lineIn1[x + xmin]), 1); sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); pix = _mm256_inserti128_si256(_mm256_castsi128_si256( - _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])), - _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1); + mm_cvtepu8_epi32(&lineIn2[x + xmin])), + mm_cvtepu8_epi32(&lineIn3[x + xmin]), 1); sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk)); } @@ -182,16 +182,16 @@ ImagingResampleHorizontalConvolution8u4x( // [16] xx k0 xx k0 xx k0 xx k0 mmk = _mm_set1_epi32(k[x]); // [16] xx a0 xx b0 xx g0 xx r0 - pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]); + pix = mm_cvtepu8_epi32(&lineIn0[x + xmin]); sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); - pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]); + pix = mm_cvtepu8_epi32(&lineIn1[x + xmin]); sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); - pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]); + pix = mm_cvtepu8_epi32(&lineIn2[x + xmin]); sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk)); - pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]); + pix = mm_cvtepu8_epi32(&lineIn3[x + xmin]); sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk)); } @@ -354,7 +354,7 @@ ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn, } for (; x < xmax; x ++) { - __m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn[x + xmin]); + __m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]); __m128i mmk = _mm_set1_epi32(k[x]); sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); } diff --git a/src/libImaging/ResampleSIMDVerticalConv.c b/src/libImaging/ResampleSIMDVerticalConv.c index cbf46e17d..a2e4d18a9 100644 --- a/src/libImaging/ResampleSIMDVerticalConv.c +++ b/src/libImaging/ResampleSIMDVerticalConv.c @@ -244,7 +244,7 @@ ImagingResampleVerticalConvolution8u(UINT32 *lineOut, Imaging imIn, sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); } for (; x < xmax; x++) { - __m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &imIn->image32[x + xmin][xx]); + __m128i pix = mm_cvtepu8_epi32(&imIn->image32[x + xmin][xx]); __m128i mmk = _mm_set1_epi32(k[x]); sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); }