diff --git a/Tests/test_image_resample.py b/Tests/test_image_resample.py
index 8bf2ce916..bd75be6b1 100644
--- a/Tests/test_image_resample.py
+++ b/Tests/test_image_resample.py
@@ -81,12 +81,13 @@ class TestImagingCoreResampleAccuracy:
                 s_px[size[0] - x - 1, y] = 255 - val
         return sample
 
-    def check_case(self, case, sample):
+    def check_case(self, case, sample, strict=True):
         s_px = sample.load()
         c_px = case.load()
         for y in range(case.size[1]):
             for x in range(case.size[0]):
-                if c_px[x, y] != s_px[x, y]:
+                diff = abs(c_px[x, y] - s_px[x, y])
+                if (strict and diff) or diff > 1:
                     message = (
                         f"\nHave: \n{self.serialize_image(case)}\n"
                         f"\nExpected: \n{self.serialize_image(sample)}"
@@ -217,7 +218,7 @@ class TestImagingCoreResampleAccuracy:
             "b8 b7 b4 bf c4 a0"
         )
         for channel in case.split():
-            self.check_case(channel, self.make_sample(data, (12, 12)))
+            self.check_case(channel, self.make_sample(data, (12, 12)), strict=False)
 
     def test_box_filter_correct_range(self):
         im = Image.new("RGB", (8, 8), "#1688ff").resize((100, 100), Image.BOX)
diff --git a/libImaging/ResampleSIMDHorizontalConv.c b/libImaging/ResampleSIMDHorizontalConv.c
new file mode 100644
index 000000000..6f8e603c3
--- /dev/null
+++ b/libImaging/ResampleSIMDHorizontalConv.c
@@ -0,0 +1,382 @@
+void
+ImagingResampleHorizontalConvolution8u4x(
+    UINT32 *lineOut0, UINT32 *lineOut1, UINT32 *lineOut2, UINT32 *lineOut3,
+    UINT32 *lineIn0, UINT32 *lineIn1, UINT32 *lineIn2, UINT32 *lineIn3,
+    int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
+{
+    int xmin, xmax, xx, x;
+    INT16 *k;
+
+    for (xx = 0; xx < xsize; xx++) {
+        xmin = xbounds[xx * 2 + 0];
+        xmax = xbounds[xx * 2 + 1];
+        k = &kk[xx * kmax];
+        x = 0;
+
+#if defined(__AVX2__)
+
+        __m256i sss0, sss1;
+        __m256i zero = _mm256_setzero_si256();
+        __m256i initial = _mm256_set1_epi32((1 << (coefs_precision -1)));
+        sss0 = initial;
+        sss1 = initial;
+
+        for (; x < xmax - 3; x += 4) {
+            __m256i pix, mmk0, mmk1, source, ksource;
+            __m128i tmp = _mm_loadl_epi64((__m128i *) &k[x]);
+            ksource = _mm256_insertf128_si256(
+                _mm256_castsi128_si256(tmp), tmp, 1);
+
+            mmk0 = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0,
+                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+            mmk1 = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+                7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
+                7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
+
+            source = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *) &lineIn0[x + xmin])),
+                _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]), 1);
+            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
+            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
+            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));
+
+            source = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *) &lineIn2[x + xmin])),
+                _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]), 1);
+            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
+            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
+            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
+        }
+
+        for (; x < xmax - 1; x += 2) {
+            __m256i pix, mmk;
+            __m128i ksource = _mm_cvtsi32_si128(*(int *) &k[x]);
+            ksource = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+            mmk = _mm256_inserti128_si256(_mm256_castsi128_si256(ksource), ksource, 1);
+
+            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin])),
+                _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]), 1);
+            pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+
+            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin])),
+                _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]), 1);
+            pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+        }
+
+        for (; x < xmax; x ++) {
+            __m256i pix, mmk;
+
+            // [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
+            mmk = _mm256_set1_epi32(k[x]);
+
+            // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
+            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])),
+                _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1);
+            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+
+            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])),
+                _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1);
+            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+        }
+
+        sss0 = _mm256_srai_epi32(sss0, coefs_precision);
+        sss1 = _mm256_srai_epi32(sss1, coefs_precision);
+        sss0 = _mm256_packs_epi32(sss0, zero);
+        sss1 = _mm256_packs_epi32(sss1, zero);
+        sss0 = _mm256_packus_epi16(sss0, zero);
+        sss1 = _mm256_packus_epi16(sss1, zero);
+        lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
+        lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
+        lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
+        lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
+
+#else
+
+        __m128i sss0, sss1, sss2, sss3;
+        __m128i initial = _mm_set1_epi32((1 << (coefs_precision -1)));
+        sss0 = initial;
+        sss1 = initial;
+        sss2 = initial;
+        sss3 = initial;
+
+        for (; x < xmax - 3; x += 4) {
+            __m128i pix, mmk_lo, mmk_hi, source;
+            __m128i mask_lo = _mm_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
+            __m128i mask_hi = _mm_set_epi8(
+                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8);
+
+            // [16] xx xx xx xx k3 k2 k1 k0
+            __m128i ksource = _mm_loadl_epi64((__m128i *) &k[x]);
+            // [16] k1 k0 k1 k0 k1 k0 k1 k0
+            mmk_lo = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+            mmk_hi = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
+
+            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
+            source = _mm_loadu_si128((__m128i *) &lineIn0[x + xmin]);
+            // [16] a1 a0 b1 b0 g1 g0 r1 r0
+            pix = _mm_shuffle_epi8(source, mask_lo);
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_lo));
+            // [16] a3 a2 b3 b2 g3 g2 r3 r2
+            pix = _mm_shuffle_epi8(source, mask_hi);
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_hi));
+
+            source = _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]);
+            pix = _mm_shuffle_epi8(source, mask_lo);
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_lo));
+            pix = _mm_shuffle_epi8(source, mask_hi);
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_hi));
+
+            source = _mm_loadu_si128((__m128i *) &lineIn2[x + xmin]);
+            pix = _mm_shuffle_epi8(source, mask_lo);
+            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_lo));
+            pix = _mm_shuffle_epi8(source, mask_hi);
+            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_hi));
+
+            source = _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]);
+            pix = _mm_shuffle_epi8(source, mask_lo);
+            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_lo));
+            pix = _mm_shuffle_epi8(source, mask_hi);
+            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_hi));
+        }
+
+        for (; x < xmax - 1; x += 2) {
+            __m128i pix, mmk;
+            __m128i mask = _mm_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
+
+            // [16] k1 k0 k1 k0 k1 k0 k1 k0
+            mmk = _mm_set1_epi32(*(int *) &k[x]);
+
+            // [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0
+            pix = _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin]);
+            // [16] a1 a0 b1 b0 g1 g0 r1 r0
+            pix = _mm_shuffle_epi8(pix, mask);
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]);
+            pix = _mm_shuffle_epi8(pix, mask);
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin]);
+            pix = _mm_shuffle_epi8(pix, mask);
+            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]);
+            pix = _mm_shuffle_epi8(pix, mask);
+            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
+        }
+
+        for (; x < xmax; x ++) {
+            __m128i pix, mmk;
+            // [16] xx k0 xx k0 xx k0 xx k0
+            mmk = _mm_set1_epi32(k[x]);
+            // [16] xx a0 xx b0 xx g0 xx r0
+            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]);
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]);
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]);
+            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]);
+            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
+        }
+
+        sss0 = _mm_srai_epi32(sss0, coefs_precision);
+        sss1 = _mm_srai_epi32(sss1, coefs_precision);
+        sss2 = _mm_srai_epi32(sss2, coefs_precision);
+        sss3 = _mm_srai_epi32(sss3, coefs_precision);
+        sss0 = _mm_packs_epi32(sss0, sss0);
+        sss1 = _mm_packs_epi32(sss1, sss1);
+        sss2 = _mm_packs_epi32(sss2, sss2);
+        sss3 = _mm_packs_epi32(sss3, sss3);
+        lineOut0[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss0, sss0));
+        lineOut1[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss1, sss1));
+        lineOut2[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss2, sss2));
+        lineOut3[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss3, sss3));
+
+#endif
+
+    }
+}
+
+
+void
+ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn,
+    int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
+{
+    int xmin, xmax, xx, x;
+    INT16 *k;
+
+    for (xx = 0; xx < xsize; xx++) {
+        __m128i sss;
+        xmin = xbounds[xx * 2 + 0];
+        xmax = xbounds[xx * 2 + 1];
+        k = &kk[xx * kmax];
+        x = 0;
+
+#if defined(__AVX2__)
+
+        if (xmax < 8) {
+            sss = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
+        } else {
+
+            __m256i sss256 = _mm256_set1_epi32((1 << (coefs_precision -2)) + xmax / 4);
+
+            for (; x < xmax - 7; x += 8) {
+                __m256i pix, mmk, source;
+                __m128i tmp = _mm_loadu_si128((__m128i *) &k[x]);
+                __m256i ksource = _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(tmp), tmp, 1);
+
+                source = _mm256_loadu_si256((__m256i *) &lineIn[x + xmin]);
+
+                pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                    -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+                    -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+                mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+                    11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
+                    3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+                sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+
+                pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                    -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+                    -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
+                mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+                    15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
+                    7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
+                sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+            }
+
+            for (; x < xmax - 3; x += 4) {
+                __m256i pix, mmk, source;
+                __m128i tmp = _mm_loadl_epi64((__m128i *) &k[x]);
+                __m256i ksource = _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(tmp), tmp, 1);
+
+                tmp = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
+                source = _mm256_insertf128_si256(
+                    _mm256_castsi128_si256(tmp), tmp, 1);
+
+                pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+                    -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+                    -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+                mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+                    7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
+                    3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+                sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+            }
+
+            sss = _mm_add_epi32(
+                _mm256_extracti128_si256(sss256, 0),
+                _mm256_extracti128_si256(sss256, 1)
+            );
+
+        }
+
+#else
+
+        sss = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
+
+        for (; x < xmax - 7; x += 8) {
+            __m128i pix, mmk, source;
+            __m128i ksource = _mm_loadu_si128((__m128i *) &k[x]);
+
+            source = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                5,4, 1,0, 5,4, 1,0, 5,4, 1,0, 5,4, 1,0));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                7,6, 3,2, 7,6, 3,2, 7,6, 3,2, 7,6, 3,2));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+
+            source = _mm_loadu_si128((__m128i *) &lineIn[x + 4 + xmin]);
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                13,12, 9,8, 13,12, 9,8, 13,12, 9,8, 13,12, 9,8));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                15,14, 11,10, 15,14, 11,10, 15,14, 11,10, 15,14, 11,10));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+        }
+
+        for (; x < xmax - 3; x += 4) {
+            __m128i pix, mmk;
+            __m128i source = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
+            __m128i ksource = _mm_loadl_epi64((__m128i *) &k[x]);
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                5,4, 1,0, 5,4, 1,0, 5,4, 1,0, 5,4, 1,0));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                7,6, 3,2, 7,6, 3,2, 7,6, 3,2, 7,6, 3,2));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+        }
+
+#endif
+
+        for (; x < xmax - 1; x += 2) {
+            __m128i pix, mmk;
+            __m128i source = _mm_loadl_epi64((__m128i *) &lineIn[x + xmin]);
+            __m128i ksource = _mm_cvtsi32_si128(*(int *) &k[x]);
+
+            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
+                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
+                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+        }
+
+        for (; x < xmax; x ++) {
+            __m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn[x + xmin]);
+            __m128i mmk = _mm_set1_epi32(k[x]);
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+        }
+        sss = _mm_srai_epi32(sss, coefs_precision);
+        sss = _mm_packs_epi32(sss, sss);
+        lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
+    }
+}
diff --git a/libImaging/ResampleSIMDVerticalConv.c b/libImaging/ResampleSIMDVerticalConv.c
new file mode 100644
index 000000000..02648f7b9
--- /dev/null
+++ b/libImaging/ResampleSIMDVerticalConv.c
@@ -0,0 +1,263 @@
+
+void
+ImagingResampleVerticalConvolution8u(UINT32 *lineOut, Imaging imIn,
+    int xmin, int xmax, INT16 *k, int coefs_precision)
+{
+    int x;
+    int xx = 0;
+    int xsize = imIn->xsize;
+
+    __m128i initial = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
+
+#if defined(__AVX2__)
+
+    __m256i initial_256 = _mm256_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
+
+    for (; xx < xsize - 7; xx += 8) {
+        __m256i sss0 = initial_256;
+        __m256i sss1 = initial_256;
+        __m256i sss2 = initial_256;
+        __m256i sss3 = initial_256;
+        x = 0;
+        for (; x < xmax - 1; x += 2) {
+            __m256i source, source1, source2;
+            __m256i pix, mmk, mmk1;
+            mmk = _mm256_set1_epi32(k[x]);
+            mmk1 = _mm256_set1_epi32(k[x + 1]);
+            mmk = _mm256_unpacklo_epi16(
+                _mm256_packs_epi32(mmk, mmk),
+                _mm256_packs_epi32(mmk1, mmk1));
+
+            source1 = _mm256_loadu_si256(  // top line
+                (__m256i *) &imIn->image32[x + xmin][xx]);
+            source2 = _mm256_loadu_si256(  // bottom line
+                (__m256i *) &imIn->image32[x + 1 + xmin][xx]);
+
+            source = _mm256_unpacklo_epi8(source1, source2);
+            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+
+            source = _mm256_unpackhi_epi8(source1, source2);
+            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+            sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
+            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+            sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
+        }
+        for (; x < xmax; x += 1) {
+            __m256i source, source1, pix, mmk;
+            mmk = _mm256_set1_epi32(k[x]);
+
+            source1 = _mm256_loadu_si256(  // top line
+                (__m256i *) &imIn->image32[x + xmin][xx]);
+
+            source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
+            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+
+            source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
+            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+            sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
+            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+            sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
+        }
+        sss0 = _mm256_srai_epi32(sss0, coefs_precision);
+        sss1 = _mm256_srai_epi32(sss1, coefs_precision);
+        sss2 = _mm256_srai_epi32(sss2, coefs_precision);
+        sss3 = _mm256_srai_epi32(sss3, coefs_precision);
+
+        sss0 = _mm256_packs_epi32(sss0, sss1);
+        sss2 = _mm256_packs_epi32(sss2, sss3);
+        sss0 = _mm256_packus_epi16(sss0, sss2);
+        _mm256_storeu_si256((__m256i *) &lineOut[xx], sss0);
+    }
+
+#else
+
+    for (; xx < xsize - 7; xx += 8) {
+        __m128i sss0 = initial;
+        __m128i sss1 = initial;
+        __m128i sss2 = initial;
+        __m128i sss3 = initial;
+        __m128i sss4 = initial;
+        __m128i sss5 = initial;
+        __m128i sss6 = initial;
+        __m128i sss7 = initial;
+        x = 0;
+        for (; x < xmax - 1; x += 2) {
+            __m128i source, source1, source2;
+            __m128i pix, mmk, mmk1;
+            mmk = _mm_set1_epi32(k[x]);
+            mmk1 = _mm_set1_epi32(k[x + 1]);
+            mmk = _mm_unpacklo_epi16(
+                _mm_packs_epi32(mmk, mmk),
+                _mm_packs_epi32(mmk1, mmk1));
+
+            source1 = _mm_loadu_si128(  // top line
+                (__m128i *) &imIn->image32[x + xmin][xx]);
+            source2 = _mm_loadu_si128(  // bottom line
+                (__m128i *) &imIn->image32[x + 1 + xmin][xx]);
+
+            source = _mm_unpacklo_epi8(source1, source2);
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+
+            source = _mm_unpackhi_epi8(source1, source2);
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
+
+            source1 = _mm_loadu_si128(  // top line
+                (__m128i *) &imIn->image32[x + xmin][xx + 4]);
+            source2 = _mm_loadu_si128(  // bottom line
+                (__m128i *) &imIn->image32[x + 1 + xmin][xx + 4]);
+
+            source = _mm_unpacklo_epi8(source1, source2);
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss4 = _mm_add_epi32(sss4, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss5 = _mm_add_epi32(sss5, _mm_madd_epi16(pix, mmk));
+
+            source = _mm_unpackhi_epi8(source1, source2);
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss6 = _mm_add_epi32(sss6, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss7 = _mm_add_epi32(sss7, _mm_madd_epi16(pix, mmk));
+        }
+        for (; x < xmax; x += 1) {
+            __m128i source, source1, pix, mmk;
+            mmk = _mm_set1_epi32(k[x]);
+
+            source1 = _mm_loadu_si128(  // top line
+                (__m128i *) &imIn->image32[x + xmin][xx]);
+
+            source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+
+            source = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
+
+            source1 = _mm_loadu_si128(  // top line
+                (__m128i *) &imIn->image32[x + xmin][xx + 4]);
+
+            source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss4 = _mm_add_epi32(sss4, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss5 = _mm_add_epi32(sss5, _mm_madd_epi16(pix, mmk));
+
+            source = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss6 = _mm_add_epi32(sss6, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss7 = _mm_add_epi32(sss7, _mm_madd_epi16(pix, mmk));
+        }
+        sss0 = _mm_srai_epi32(sss0, coefs_precision);
+        sss1 = _mm_srai_epi32(sss1, coefs_precision);
+        sss2 = _mm_srai_epi32(sss2, coefs_precision);
+        sss3 = _mm_srai_epi32(sss3, coefs_precision);
+        sss4 = _mm_srai_epi32(sss4, coefs_precision);
+        sss5 = _mm_srai_epi32(sss5, coefs_precision);
+        sss6 = _mm_srai_epi32(sss6, coefs_precision);
+        sss7 = _mm_srai_epi32(sss7, coefs_precision);
+
+        sss0 = _mm_packs_epi32(sss0, sss1);
+        sss2 = _mm_packs_epi32(sss2, sss3);
+        sss0 = _mm_packus_epi16(sss0, sss2);
+        _mm_storeu_si128((__m128i *) &lineOut[xx], sss0);
+        sss4 = _mm_packs_epi32(sss4, sss5);
+        sss6 = _mm_packs_epi32(sss6, sss7);
+        sss4 = _mm_packus_epi16(sss4, sss6);
+        _mm_storeu_si128((__m128i *) &lineOut[xx + 4], sss4);
+    }
+
+#endif
+
+    for (; xx < xsize - 1; xx += 2) {
+        __m128i sss0 = initial;  // left row
+        __m128i sss1 = initial;  // right row
+        x = 0;
+        for (; x < xmax - 1; x += 2) {
+            __m128i source, source1, source2;
+            __m128i pix, mmk, mmk1;
+            mmk = _mm_set1_epi32(k[x]);
+            mmk1 = _mm_set1_epi32(k[x + 1]);
+            mmk = _mm_unpacklo_epi16(
+                _mm_packs_epi32(mmk, mmk),
+                _mm_packs_epi32(mmk1, mmk1));
+
+            source1 = _mm_loadl_epi64(  // top line
+                (__m128i *) &imIn->image32[x + xmin][xx]);
+            source2 = _mm_loadl_epi64(  // bottom line
+                (__m128i *) &imIn->image32[x + 1 + xmin][xx]);
+
+            source = _mm_unpacklo_epi8(source1, source2);
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+        }
+        for (; x < xmax; x += 1) {
+            __m128i source, source1, pix, mmk;
+            mmk = _mm_set1_epi32(k[x]);
+
+            source1 = _mm_loadl_epi64(  // top line
+                (__m128i *) &imIn->image32[x + xmin][xx]);
+
+            source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+        }
+        sss0 = _mm_srai_epi32(sss0, coefs_precision);
+        sss1 = _mm_srai_epi32(sss1, coefs_precision);
+
+        sss0 = _mm_packs_epi32(sss0, sss1);
+        sss0 = _mm_packus_epi16(sss0, sss0);
+        _mm_storel_epi64((__m128i *) &lineOut[xx], sss0);
+    }
+
+    for (; xx < xsize; xx++) {
+        __m128i sss = initial;
+        x = 0;
+        for (; x < xmax - 1; x += 2) {
+            __m128i source, source1, source2;
+            __m128i pix, mmk, mmk1;
+            mmk = _mm_set1_epi32(k[x]);
+            mmk1 = _mm_set1_epi32(k[x + 1]);
+            mmk = _mm_unpacklo_epi16(
+                _mm_packs_epi32(mmk, mmk),
+                _mm_packs_epi32(mmk1, mmk1));
+
+            source1 = _mm_cvtsi32_si128(  // top line
+                *(int *) &imIn->image32[x + xmin][xx]);
+            source2 = _mm_cvtsi32_si128(  // bottom line
+                *(int *) &imIn->image32[x + 1 + xmin][xx]);
+
+            source = _mm_unpacklo_epi8(source1, source2);
+            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+        }
+        for (; x < xmax; x++) {
+            __m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &imIn->image32[x + xmin][xx]);
+            __m128i mmk = _mm_set1_epi32(k[x]);
+            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+        }
+        sss = _mm_srai_epi32(sss, coefs_precision);
+        sss = _mm_packs_epi32(sss, sss);
+        lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
+    }
+}
diff --git a/src/libImaging/Resample.c b/src/libImaging/Resample.c
index cf79d8a4e..e1a015ef9 100644
--- a/src/libImaging/Resample.c
+++ b/src/libImaging/Resample.c
@@ -1,6 +1,17 @@
 #include "Imaging.h"
 
 #include <math.h>
+#include <mmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#if defined(__AVX2__)
+    #include <immintrin.h>
+#endif
+
+
+#include "ResampleSIMDHorizontalConv.c"
+#include "ResampleSIMDVerticalConv.c"
 
 #define ROUND_UP(f) ((int)((f) >= 0.0 ? (f) + 0.5F : (f)-0.5F))
 
@@ -90,6 +101,8 @@ static struct filter LANCZOS = {lanczos_filter, 3.0};
    in the other it will be more than 1.0. That is why we need
    two extra bits for overflow and int type. */
 #define PRECISION_BITS (32 - 8 - 2)
+/* We use signed INT16 type to store coefficients. */
+#define MAX_COEFS_PRECISION (16 - 1)
 
 /* Handles values form -640 to 639. */
 UINT8 _clip8_lookups[1280] = {
@@ -173,9 +186,9 @@ UINT8 _clip8_lookups[1280] = {
 
 UINT8 *clip8_lookups = &_clip8_lookups[640];
 
-static inline UINT8
-clip8(int in) {
-    return clip8_lookups[in >> PRECISION_BITS];
+static inline UINT8 clip8(int in, int precision)
+{
+    return clip8_lookups[in >> precision];
 }
 
 int
@@ -265,34 +278,54 @@ precompute_coeffs(
     return ksize;
 }
 
-void
-normalize_coeffs_8bpc(int outSize, int ksize, double *prekk) {
+
+int
+normalize_coeffs_8bpc(int outSize, int ksize, double *prekk)
+{
     int x;
-    INT32 *kk;
+    INT16 *kk;
+    int coefs_precision;
+    double maxkk;
 
     // use the same buffer for normalized coefficients
-    kk = (INT32 *)prekk;
+    kk = (INT16 *) prekk;
+
+    maxkk = prekk[0];
+    for (x = 0; x < outSize * ksize; x++) {
+        if (maxkk < prekk[x]) {
+            maxkk = prekk[x];
+        }
+    }
+
+    coefs_precision = 0;
+    while (maxkk < (1 << (MAX_COEFS_PRECISION-1)) && (coefs_precision < PRECISION_BITS)) {
+        maxkk *= 2;
+        coefs_precision += 1;
+    };
 
     for (x = 0; x < outSize * ksize; x++) {
         if (prekk[x] < 0) {
-            kk[x] = (int)(-0.5 + prekk[x] * (1 << PRECISION_BITS));
+            kk[x] = (int) (-0.5 + prekk[x] * (1 << coefs_precision));
        } else {
-            kk[x] = (int)(0.5 + prekk[x] * (1 << PRECISION_BITS));
+            kk[x] = (int) (0.5 + prekk[x] * (1 << coefs_precision));
         }
     }
+    return coefs_precision;
 }
 
+
 void
 ImagingResampleHorizontal_8bpc(
     Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
     ImagingSectionCookie cookie;
-    int ss0, ss1, ss2, ss3;
+    int ss0;
     int xx, yy, x, xmin, xmax;
-    INT32 *k, *kk;
+    INT16 *k, *kk;
+    int coefs_precision;
 
     // use the same buffer for normalized coefficients
-    kk = (INT32 *)prekk;
-    normalize_coeffs_8bpc(imOut->xsize, ksize, prekk);
+    kk = (INT16 *) prekk;
+    coefs_precision = normalize_coeffs_8bpc(imOut->xsize, ksize, prekk);
 
     ImagingSectionEnter(&cookie);
     if (imIn->image8) {
@@ -301,74 +334,36 @@ ImagingResampleHorizontal_8bpc(
                 xmin = bounds[xx * 2 + 0];
                 xmax = bounds[xx * 2 + 1];
                 k = &kk[xx * ksize];
-                ss0 = 1 << (PRECISION_BITS - 1);
+                ss0 = 1 << (coefs_precision -1);
                 for (x = 0; x < xmax; x++) {
                     ss0 += ((UINT8)imIn->image8[yy + offset][x + xmin]) * k[x];
                }
-                imOut->image8[yy][xx] = clip8(ss0);
+                imOut->image8[yy][xx] = clip8(ss0, coefs_precision);
             }
         }
     } else if (imIn->type == IMAGING_TYPE_UINT8) {
-        if (imIn->bands == 2) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    xmin = bounds[xx * 2 + 0];
-                    xmax = bounds[xx * 2 + 1];
-                    k = &kk[xx * ksize];
-                    ss0 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (x = 0; x < xmax; x++) {
-                        ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) *
-                               k[x];
-                        ss3 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 3]) *
-                               k[x];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else if (imIn->bands == 3) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    xmin = bounds[xx * 2 + 0];
-                    xmax = bounds[xx * 2 + 1];
-                    k = &kk[xx * ksize];
-                    ss0 = ss1 = ss2 = 1 << (PRECISION_BITS - 1);
-                    for (x = 0; x < xmax; x++) {
-                        ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) *
-                               k[x];
-                        ss1 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 1]) *
-                               k[x];
-                        ss2 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 2]) *
-                               k[x];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    xmin = bounds[xx * 2 + 0];
-                    xmax = bounds[xx * 2 + 1];
-                    k = &kk[xx * ksize];
-                    ss0 = ss1 = ss2 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (x = 0; x < xmax; x++) {
-                        ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) *
-                               k[x];
-                        ss1 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 1]) *
-                               k[x];
-                        ss2 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 2]) *
-                               k[x];
-                        ss3 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 3]) *
-                               k[x];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
+        yy = 0;
+        for (; yy < imOut->ysize - 3; yy += 4) {
+            ImagingResampleHorizontalConvolution8u4x(
+                (UINT32 *) imOut->image32[yy],
+                (UINT32 *) imOut->image32[yy + 1],
+                (UINT32 *) imOut->image32[yy + 2],
+                (UINT32 *) imOut->image32[yy + 3],
+                (UINT32 *) imIn->image32[yy + offset],
+                (UINT32 *) imIn->image32[yy + offset + 1],
+                (UINT32 *) imIn->image32[yy + offset + 2],
+                (UINT32 *) imIn->image32[yy + offset + 3],
+                imOut->xsize, bounds, kk, ksize,
+                coefs_precision
+            );
+        }
+        for (; yy < imOut->ysize; yy++) {
+            ImagingResampleHorizontalConvolution8u(
+                (UINT32 *) imOut->image32[yy],
+                (UINT32 *) imIn->image32[yy + offset],
+                imOut->xsize, bounds, kk, ksize,
+                coefs_precision
+            );
        }
     }
     ImagingSectionLeave(&cookie);
@@ -378,13 +373,14 @@ void
 ImagingResampleVertical_8bpc(
     Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
     ImagingSectionCookie cookie;
-    int ss0, ss1, ss2, ss3;
+    int ss0;
     int xx, yy, y, ymin, ymax;
-    INT32 *k, *kk;
+    INT16 *k, *kk;
+    int coefs_precision;
 
     // use the same buffer for normalized coefficients
-    kk = (INT32 *)prekk;
-    normalize_coeffs_8bpc(imOut->ysize, ksize, prekk);
+    kk = (INT16 *) prekk;
+    coefs_precision = normalize_coeffs_8bpc(imOut->ysize, ksize, prekk);
 
     ImagingSectionEnter(&cookie);
     if (imIn->image8) {
@@ -393,65 +389,22 @@ ImagingResampleVertical_8bpc(
             ymin = bounds[yy * 2 + 0];
             ymax = bounds[yy * 2 + 1];
             for (xx = 0; xx < imOut->xsize; xx++) {
-                ss0 = 1 << (PRECISION_BITS - 1);
+                ss0 = 1 << (coefs_precision -1);
                 for (y = 0; y < ymax; y++) {
                     ss0 += ((UINT8)imIn->image8[y + ymin][xx]) * k[y];
                 }
-                imOut->image8[yy][xx] = clip8(ss0);
+                imOut->image8[yy][xx] = clip8(ss0, coefs_precision);
             }
         }
     } else if (imIn->type == IMAGING_TYPE_UINT8) {
-        if (imIn->bands == 2) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                k = &kk[yy * ksize];
-                ymin = bounds[yy * 2 + 0];
-                ymax = bounds[yy * 2 + 1];
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    ss0 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (y = 0; y < ymax; y++) {
-                        ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
-                        ss3 += ((UINT8)imIn->image[y + ymin][xx * 4 + 3]) * k[y];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else if (imIn->bands == 3) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                k = &kk[yy * ksize];
-                ymin = bounds[yy * 2 + 0];
-                ymax = bounds[yy * 2 + 1];
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    ss0 = ss1 = ss2 = 1 << (PRECISION_BITS - 1);
-                    for (y = 0; y < ymax; y++) {
-                        ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
-                        ss1 += ((UINT8)imIn->image[y + ymin][xx * 4 + 1]) * k[y];
-                        ss2 += ((UINT8)imIn->image[y + ymin][xx * 4 + 2]) * k[y];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                k = &kk[yy * ksize];
-                ymin = bounds[yy * 2 + 0];
-                ymax = bounds[yy * 2 + 1];
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    ss0 = ss1 = ss2 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (y = 0; y < ymax; y++) {
-                        ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
-                        ss1 += ((UINT8)imIn->image[y + ymin][xx * 4 + 1]) * k[y];
-                        ss2 += ((UINT8)imIn->image[y + ymin][xx * 4 + 2]) * k[y];
-                        ss3 += ((UINT8)imIn->image[y + ymin][xx * 4 + 3]) * k[y];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
+        for (yy = 0; yy < imOut->ysize; yy++) {
+            k = &kk[yy * ksize];
+            ymin = bounds[yy * 2 + 0];
+            ymax = bounds[yy * 2 + 1];
+            ImagingResampleVerticalConvolution8u(
+                (UINT32 *) imOut->image32[yy], imIn,
+                ymin, ymax, k, coefs_precision
+            );
        }
     }
     ImagingSectionLeave(&cookie);
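
Illustrative note, not part of the patch: the numeric core of this change is that normalize_coeffs_8bpc() now stores filter coefficients as INT16 with a dynamically chosen fixed-point precision (coefs_precision) instead of INT32 scaled by the fixed PRECISION_BITS, which is what allows the convolutions to use _mm_madd_epi16 / _mm256_madd_epi16. A minimal standalone sketch of that precision selection, assuming the same constants as Resample.c above (pick_coefs_precision is a hypothetical helper name used only here):

#include <stdio.h>

#define PRECISION_BITS (32 - 8 - 2)
#define MAX_COEFS_PRECISION (16 - 1)

/* Keep doubling the largest coefficient while the scaled value stays
   below 1 << (MAX_COEFS_PRECISION - 1), capped at PRECISION_BITS
   doublings; the number of doublings is the fractional precision used
   for the INT16 coefficient table. */
static int pick_coefs_precision(double maxkk) {
    int coefs_precision = 0;
    while (maxkk < (1 << (MAX_COEFS_PRECISION - 1)) && coefs_precision < PRECISION_BITS) {
        maxkk *= 2;
        coefs_precision += 1;
    }
    return coefs_precision;
}

int main(void) {
    /* Typical resampling kernels have a largest coefficient near 1.0,
       which leaves about 14 fractional bits for the INT16 coefficients. */
    printf("%d\n", pick_coefs_precision(1.0)); /* prints 14 */
    return 0;
}

This mirrors the loop added to normalize_coeffs_8bpc and is only meant to show how many fractional bits remain once the coefficients have to fit a signed 16-bit lane.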