SIMD Resample: fix wrong usage of xmax for division compensation

This commit is contained in:
Alexander 2019-01-20 15:27:30 +03:00 committed by Alexander Karpinsky
parent 247cba2e30
commit b3c28911c0
2 changed files with 8 additions and 7 deletions

View File

@ -17,7 +17,7 @@ ImagingResampleHorizontalConvolution8u4x(
{ {
__m256i sss0, sss1; __m256i sss0, sss1;
__m256i zero = _mm256_setzero_si256(); __m256i zero = _mm256_setzero_si256();
__m256i initial = _mm256_set1_epi32(1 << (coefs_precision -1)); __m256i initial = _mm256_set1_epi32(1 << (coefs_precision-1));
sss0 = initial; sss0 = initial;
sss1 = initial; sss1 = initial;
@ -106,7 +106,7 @@ ImagingResampleHorizontalConvolution8u4x(
#else #else
{ {
__m128i sss0, sss1, sss2, sss3; __m128i sss0, sss1, sss2, sss3;
__m128i initial = _mm_set1_epi32(1 << (coefs_precision -1)); __m128i initial = _mm_set1_epi32(1 << (coefs_precision-1));
sss0 = initial; sss0 = initial;
sss1 = initial; sss1 = initial;
sss2 = initial; sss2 = initial;
@ -231,10 +231,11 @@ ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn,
#if defined(__AVX2__) #if defined(__AVX2__)
if (xmax < 8) { if (xmax < 8) {
sss = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2); sss = _mm_set1_epi32(1 << (coefs_precision-1));
} else { } else {
__m256i sss256 = _mm256_set1_epi32((1 << (coefs_precision -2)) + xmax / 4); // Lower part will be added to higher, use only half of the error
__m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision-2));
for (; x < xmax - 7; x += 8) { for (; x < xmax - 7; x += 8) {
__m256i pix, mmk, source; __m256i pix, mmk, source;
@ -289,7 +290,7 @@ ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn,
#else #else
sss = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2); sss = _mm_set1_epi32(1 << (coefs_precision-1));
for (; x < xmax - 7; x += 8) { for (; x < xmax - 7; x += 8) {
__m128i pix, mmk, source; __m128i pix, mmk, source;

View File

@ -7,11 +7,11 @@ ImagingResampleVerticalConvolution8u(UINT32 *lineOut, Imaging imIn,
int xx = 0; int xx = 0;
int xsize = imIn->xsize; int xsize = imIn->xsize;
__m128i initial = _mm_set1_epi32(1 << (coefs_precision -1)); __m128i initial = _mm_set1_epi32(1 << (coefs_precision-1));
#if defined(__AVX2__) #if defined(__AVX2__)
__m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision -1)); __m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision-1));
for (; xx < xsize - 7; xx += 8) { for (; xx < xsize - 7; xx += 8) {
__m256i sss0 = initial_256; __m256i sss0 = initial_256;