SIMD Resample. unrolled SSE4 & AVX2

homm 2016-10-04 02:49:07 +03:00 committed by Alexander Karpinsky
parent b4a54840ee
commit 8796b40ef6
4 changed files with 734 additions and 135 deletions
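Context for the diffs below: every kernel in this commit evaluates the same per-channel operation that Resample.c used to run in scalar code, a fixed-point weighted sum over a window of input pixels, shifted back down and clipped to 8 bits, with INT16 coefficients pre-scaled by 1 << coefs_precision. A scalar sketch of that inner loop (illustrative names, not code from the commit):

#include <stdint.h>

/* Scalar reference for one output pixel of one channel: a fixed-point
 * convolution over xmax taps starting at xmin, with coefficients that
 * were pre-scaled by 1 << coefs_precision. */
static uint8_t
convolve_pixel(const uint8_t *line, int xmin, int xmax,
               const int16_t *k, int coefs_precision)
{
    int x;
    int ss = 1 << (coefs_precision - 1);   /* rounding bias */
    for (x = 0; x < xmax; x++) {
        ss += line[x + xmin] * k[x];
    }
    ss >>= coefs_precision;                /* back to pixel range */
    if (ss < 0) {                          /* the real code clips via */
        ss = 0;                            /* a lookup table instead */
    }
    if (ss > 255) {
        ss = 255;
    }
    return (uint8_t) ss;
}

The horizontal kernels unroll this loop across pixels within a row (and across four rows at once); the vertical kernel unrolls it across columns.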

Tests/test_image_resample.py

@@ -81,12 +81,13 @@ class TestImagingCoreResampleAccuracy:
                 s_px[size[0] - x - 1, y] = 255 - val
         return sample

-    def check_case(self, case, sample):
+    def check_case(self, case, sample, strict=True):
         s_px = sample.load()
         c_px = case.load()
         for y in range(case.size[1]):
             for x in range(case.size[0]):
-                if c_px[x, y] != s_px[x, y]:
+                diff = abs(c_px[x, y] - s_px[x, y])
+                if (strict and diff) or diff > 1:
                     message = (
                         f"\nHave: \n{self.serialize_image(case)}\n"
                         f"\nExpected: \n{self.serialize_image(sample)}"
@@ -217,7 +218,7 @@ class TestImagingCoreResampleAccuracy:
             "b8 b7 b4 bf c4 a0"
         )
         for channel in case.split():
-            self.check_case(channel, self.make_sample(data, (12, 12)))
+            self.check_case(channel, self.make_sample(data, (12, 12)), strict=False)

     def test_box_filter_correct_range(self):
         im = Image.new("RGB", (8, 8), "#1688ff").resize((100, 100), Image.BOX)
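The accuracy test previously demanded exact equality. With coefficients now stored as INT16 at a dynamically chosen precision (see normalize_coeffs_8bpc below), a channel can legitimately land one gray level away from the scalar reference, so the split-channel comparison above passes strict=False. The new predicate, restated as a standalone sketch (a hypothetical helper, not part of the commit):

#include <stdlib.h>

/* Mirrors the updated check_case() condition: in strict mode any
 * difference fails; otherwise a one-level difference is accepted. */
static int
pixel_ok(int expected, int actual, int strict)
{
    int diff = abs(actual - expected);
    return !((strict && diff) || diff > 1);
}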

libImaging/ResampleSIMDHorizontalConv.c

@@ -0,0 +1,382 @@
void
ImagingResampleHorizontalConvolution8u4x(
    UINT32 *lineOut0, UINT32 *lineOut1, UINT32 *lineOut2, UINT32 *lineOut3,
    UINT32 *lineIn0, UINT32 *lineIn1, UINT32 *lineIn2, UINT32 *lineIn3,
    int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
{
    int xmin, xmax, xx, x;
    INT16 *k;

    for (xx = 0; xx < xsize; xx++) {
        xmin = xbounds[xx * 2 + 0];
        xmax = xbounds[xx * 2 + 1];
        k = &kk[xx * kmax];
        x = 0;

#if defined(__AVX2__)

        __m256i sss0, sss1;
        __m256i zero = _mm256_setzero_si256();
        __m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
        sss0 = initial;
        sss1 = initial;

        for (; x < xmax - 3; x += 4) {
            __m256i pix, mmk0, mmk1, source, ksource;

            __m128i tmp = _mm_loadl_epi64((__m128i *) &k[x]);
            ksource = _mm256_insertf128_si256(
                _mm256_castsi128_si256(tmp), tmp, 1);
            mmk0 = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0,
                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
            mmk1 = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
                7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
                7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));

            source = _mm256_inserti128_si256(_mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *) &lineIn0[x + xmin])),
                _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]), 1);
            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));

            source = _mm256_inserti128_si256(_mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *) &lineIn2[x + xmin])),
                _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]), 1);
            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
            pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
        }

        for (; x < xmax - 1; x += 2) {
            __m256i pix, mmk;

            __m128i ksource = _mm_cvtsi32_si128(*(int *) &k[x]);
            ksource = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
            mmk = _mm256_inserti128_si256(
                _mm256_castsi128_si256(ksource), ksource, 1);

            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin])),
                _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]), 1);
            pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));

            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin])),
                _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]), 1);
            pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x++) {
            __m256i pix, mmk;

            // [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
            mmk = _mm256_set1_epi32(k[x]);

            // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
                _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])),
                _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1);
            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));

            pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
                _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])),
                _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1);
            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
        }

        sss0 = _mm256_srai_epi32(sss0, coefs_precision);
        sss1 = _mm256_srai_epi32(sss1, coefs_precision);
        sss0 = _mm256_packs_epi32(sss0, zero);
        sss1 = _mm256_packs_epi32(sss1, zero);
        sss0 = _mm256_packus_epi16(sss0, zero);
        sss1 = _mm256_packus_epi16(sss1, zero);
        lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
        lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
        lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
        lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));

#else

        __m128i sss0, sss1, sss2, sss3;
        __m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1));
        sss0 = initial;
        sss1 = initial;
        sss2 = initial;
        sss3 = initial;

        for (; x < xmax - 3; x += 4) {
            __m128i pix, mmk_lo, mmk_hi, source;
            __m128i mask_lo = _mm_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
            __m128i mask_hi = _mm_set_epi8(
                -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8);

            // [16] xx xx xx xx k3 k2 k1 k0
            __m128i ksource = _mm_loadl_epi64((__m128i *) &k[x]);
            // [16] k1 k0 k1 k0 k1 k0 k1 k0
            mmk_lo = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
            mmk_hi = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));

            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
            source = _mm_loadu_si128((__m128i *) &lineIn0[x + xmin]);
            // [16] a1 a0 b1 b0 g1 g0 r1 r0
            pix = _mm_shuffle_epi8(source, mask_lo);
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_lo));
            // [16] a3 a2 b3 b2 g3 g2 r3 r2
            pix = _mm_shuffle_epi8(source, mask_hi);
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_hi));

            source = _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]);
            pix = _mm_shuffle_epi8(source, mask_lo);
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_lo));
            pix = _mm_shuffle_epi8(source, mask_hi);
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_hi));

            source = _mm_loadu_si128((__m128i *) &lineIn2[x + xmin]);
            pix = _mm_shuffle_epi8(source, mask_lo);
            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_lo));
            pix = _mm_shuffle_epi8(source, mask_hi);
            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_hi));

            source = _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]);
            pix = _mm_shuffle_epi8(source, mask_lo);
            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_lo));
            pix = _mm_shuffle_epi8(source, mask_hi);
            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_hi));
        }

        for (; x < xmax - 1; x += 2) {
            __m128i pix, mmk;
            __m128i mask = _mm_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);

            // [16] k1 k0 k1 k0 k1 k0 k1 k0
            mmk = _mm_set1_epi32(*(int *) &k[x]);

            // [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0
            pix = _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin]);
            // [16] a1 a0 b1 b0 g1 g0 r1 r0
            pix = _mm_shuffle_epi8(pix, mask);
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));

            pix = _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]);
            pix = _mm_shuffle_epi8(pix, mask);
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));

            pix = _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin]);
            pix = _mm_shuffle_epi8(pix, mask);
            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));

            pix = _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]);
            pix = _mm_shuffle_epi8(pix, mask);
            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x++) {
            __m128i pix, mmk;

            // [16] xx k0 xx k0 xx k0 xx k0
            mmk = _mm_set1_epi32(k[x]);

            // [16] xx a0 xx b0 xx g0 xx r0
            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]);
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]);
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]);
            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
            pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]);
            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
        }

        sss0 = _mm_srai_epi32(sss0, coefs_precision);
        sss1 = _mm_srai_epi32(sss1, coefs_precision);
        sss2 = _mm_srai_epi32(sss2, coefs_precision);
        sss3 = _mm_srai_epi32(sss3, coefs_precision);
        sss0 = _mm_packs_epi32(sss0, sss0);
        sss1 = _mm_packs_epi32(sss1, sss1);
        sss2 = _mm_packs_epi32(sss2, sss2);
        sss3 = _mm_packs_epi32(sss3, sss3);
        lineOut0[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss0, sss0));
        lineOut1[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss1, sss1));
        lineOut2[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss2, sss2));
        lineOut3[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss3, sss3));

#endif
    }
}
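The pattern repeated in every loop above: shuffle pixel bytes so that two neighbouring pixels of the same channel occupy adjacent 16-bit slots, broadcast the matching coefficient pair into every 32-bit slot, and let _mm_madd_epi16 produce per-channel p0*k0 + p1*k1 sums in one instruction. A self-contained sketch of that single step (SSSE3/SSE4-class intrinsics, illustrative values; compile with -mssse3 or higher):

#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Two RGBA pixels and two INT16 coefficients, as in the x-loops above. */
    uint8_t pixels[8] = {10, 20, 30, 40,  50, 60, 70, 80};  /* p0, p1 */
    int16_t k[2] = {100, 28};                               /* k0, k1 */

    __m128i source = _mm_loadl_epi64((__m128i *) pixels);
    // [16] a1 a0 b1 b0 g1 g0 r1 r0 -- pair same-channel bytes
    __m128i pix = _mm_shuffle_epi8(source, _mm_set_epi8(
        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
    // [16] k1 k0 k1 k0 k1 k0 k1 k0 -- broadcast the coefficient pair
    __m128i mmk = _mm_set1_epi32(*(int *) k);
    // [32] per channel: p0 * k0 + p1 * k1
    __m128i sss = _mm_madd_epi16(pix, mmk);

    int out[4];
    _mm_storeu_si128((__m128i *) out, sss);
    printf("r=%d g=%d b=%d a=%d\n", out[0], out[1], out[2], out[3]);
    /* r = 10*100 + 50*28 = 2400, g = 3680, b = 4960, a = 6240 */
    return 0;
}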
void
ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn,
    int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
{
    int xmin, xmax, xx, x;
    INT16 *k;

    for (xx = 0; xx < xsize; xx++) {
        __m128i sss;
        xmin = xbounds[xx * 2 + 0];
        xmax = xbounds[xx * 2 + 1];
        k = &kk[xx * kmax];
        x = 0;

#if defined(__AVX2__)

        if (xmax < 8) {
            sss = _mm_set1_epi32((1 << (coefs_precision - 1)) + xmax / 2);
        } else {
            __m256i sss256 = _mm256_set1_epi32(
                (1 << (coefs_precision - 2)) + xmax / 4);

            for (; x < xmax - 7; x += 8) {
                __m256i pix, mmk, source;
                __m128i tmp = _mm_loadu_si128((__m128i *) &k[x]);
                __m256i ksource = _mm256_insertf128_si256(
                    _mm256_castsi128_si256(tmp), tmp, 1);

                source = _mm256_loadu_si256((__m256i *) &lineIn[x + xmin]);

                pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                    -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
                    -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
                mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
                    11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
                    3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
                sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));

                pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                    -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
                    -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
                mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
                    15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
                    7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
                sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
            }

            for (; x < xmax - 3; x += 4) {
                __m256i pix, mmk, source;
                __m128i tmp = _mm_loadl_epi64((__m128i *) &k[x]);
                __m256i ksource = _mm256_insertf128_si256(
                    _mm256_castsi128_si256(tmp), tmp, 1);

                tmp = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
                source = _mm256_insertf128_si256(
                    _mm256_castsi128_si256(tmp), tmp, 1);

                pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
                    -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
                    -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
                mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
                    7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
                    3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
                sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
            }

            sss = _mm_add_epi32(
                _mm256_extracti128_si256(sss256, 0),
                _mm256_extracti128_si256(sss256, 1)
            );
        }

#else

        sss = _mm_set1_epi32((1 << (coefs_precision - 1)) + xmax / 2);

        for (; x < xmax - 7; x += 8) {
            __m128i pix, mmk, source;
            __m128i ksource = _mm_loadu_si128((__m128i *) &k[x]);

            source = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                5,4, 1,0, 5,4, 1,0, 5,4, 1,0, 5,4, 1,0));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                7,6, 3,2, 7,6, 3,2, 7,6, 3,2, 7,6, 3,2));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));

            source = _mm_loadu_si128((__m128i *) &lineIn[x + 4 + xmin]);

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                13,12, 9,8, 13,12, 9,8, 13,12, 9,8, 13,12, 9,8));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                15,14, 11,10, 15,14, 11,10, 15,14, 11,10, 15,14, 11,10));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
        }

        for (; x < xmax - 3; x += 4) {
            __m128i pix, mmk;
            __m128i source = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
            __m128i ksource = _mm_loadl_epi64((__m128i *) &k[x]);

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                5,4, 1,0, 5,4, 1,0, 5,4, 1,0, 5,4, 1,0));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                7,6, 3,2, 7,6, 3,2, 7,6, 3,2, 7,6, 3,2));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
        }

#endif

        for (; x < xmax - 1; x += 2) {
            __m128i pix, mmk;
            __m128i source = _mm_loadl_epi64((__m128i *) &lineIn[x + xmin]);
            __m128i ksource = _mm_cvtsi32_si128(*(int *) &k[x]);

            pix = _mm_shuffle_epi8(source, _mm_set_epi8(
                -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
            mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
                3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x++) {
            __m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn[x + xmin]);
            __m128i mmk = _mm_set1_epi32(k[x]);
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
        }

        sss = _mm_srai_epi32(sss, coefs_precision);
        sss = _mm_packs_epi32(sss, sss);
        lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
    }
}
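One AVX2 detail worth calling out: the 256-bit accumulator in ImagingResampleHorizontalConvolution8u holds two independent 128-bit partial sums, which are folded with a single add before the final shift. The fold in isolation (AVX2 assumed, values illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* A 256-bit accumulator: the low and high lanes each hold partial
     * per-channel sums, as sss256 does in the 8-tap loop above. */
    __m256i sss256 = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);

    /* Fold the two lanes into one 128-bit vector of channel totals. */
    __m128i sss = _mm_add_epi32(
        _mm256_extracti128_si256(sss256, 0),    /* low lane  */
        _mm256_extracti128_si256(sss256, 1));   /* high lane */

    int out[4];
    _mm_storeu_si128((__m128i *) out, sss);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 6 8 10 12 */
    return 0;
}

This is also why sss256 is seeded with half the rounding bias per lane, (1 << (coefs_precision - 2)) + xmax / 4: summing the two lanes restores roughly (1 << (coefs_precision - 1)) + xmax / 2, matching the 128-bit path.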

libImaging/ResampleSIMDVerticalConv.c

@@ -0,0 +1,263 @@
void
ImagingResampleVerticalConvolution8u(UINT32 *lineOut, Imaging imIn,
    int xmin, int xmax, INT16 *k, int coefs_precision)
{
    int x;
    int xx = 0;
    int xsize = imIn->xsize;

    __m128i initial = _mm_set1_epi32((1 << (coefs_precision - 1)) + xmax / 2);

#if defined(__AVX2__)

    __m256i initial_256 = _mm256_set1_epi32((1 << (coefs_precision - 1)) + xmax / 2);

    for (; xx < xsize - 7; xx += 8) {
        __m256i sss0 = initial_256;
        __m256i sss1 = initial_256;
        __m256i sss2 = initial_256;
        __m256i sss3 = initial_256;
        x = 0;

        for (; x < xmax - 1; x += 2) {
            __m256i source, source1, source2;
            __m256i pix, mmk, mmk1;

            mmk = _mm256_set1_epi32(k[x]);
            mmk1 = _mm256_set1_epi32(k[x + 1]);
            mmk = _mm256_unpacklo_epi16(
                _mm256_packs_epi32(mmk, mmk),
                _mm256_packs_epi32(mmk1, mmk1));

            source1 = _mm256_loadu_si256(  // top line
                (__m256i *) &imIn->image32[x + xmin][xx]);
            source2 = _mm256_loadu_si256(  // bottom line
                (__m256i *) &imIn->image32[x + 1 + xmin][xx]);

            source = _mm256_unpacklo_epi8(source1, source2);
            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));

            source = _mm256_unpackhi_epi8(source1, source2);
            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
            sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
            sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x += 1) {
            __m256i source, source1, pix, mmk;
            mmk = _mm256_set1_epi32(k[x]);

            source1 = _mm256_loadu_si256(  // top line
                (__m256i *) &imIn->image32[x + xmin][xx]);

            source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
            sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
            sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));

            source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
            pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
            sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
            pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
            sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
        }

        sss0 = _mm256_srai_epi32(sss0, coefs_precision);
        sss1 = _mm256_srai_epi32(sss1, coefs_precision);
        sss2 = _mm256_srai_epi32(sss2, coefs_precision);
        sss3 = _mm256_srai_epi32(sss3, coefs_precision);

        sss0 = _mm256_packs_epi32(sss0, sss1);
        sss2 = _mm256_packs_epi32(sss2, sss3);
        sss0 = _mm256_packus_epi16(sss0, sss2);
        _mm256_storeu_si256((__m256i *) &lineOut[xx], sss0);
    }

#else

    for (; xx < xsize - 7; xx += 8) {
        __m128i sss0 = initial;
        __m128i sss1 = initial;
        __m128i sss2 = initial;
        __m128i sss3 = initial;
        __m128i sss4 = initial;
        __m128i sss5 = initial;
        __m128i sss6 = initial;
        __m128i sss7 = initial;
        x = 0;

        for (; x < xmax - 1; x += 2) {
            __m128i source, source1, source2;
            __m128i pix, mmk, mmk1;

            mmk = _mm_set1_epi32(k[x]);
            mmk1 = _mm_set1_epi32(k[x + 1]);
            mmk = _mm_unpacklo_epi16(
                _mm_packs_epi32(mmk, mmk),
                _mm_packs_epi32(mmk1, mmk1));

            source1 = _mm_loadu_si128(  // top line
                (__m128i *) &imIn->image32[x + xmin][xx]);
            source2 = _mm_loadu_si128(  // bottom line
                (__m128i *) &imIn->image32[x + 1 + xmin][xx]);

            source = _mm_unpacklo_epi8(source1, source2);
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));

            source = _mm_unpackhi_epi8(source1, source2);
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));

            source1 = _mm_loadu_si128(  // top line
                (__m128i *) &imIn->image32[x + xmin][xx + 4]);
            source2 = _mm_loadu_si128(  // bottom line
                (__m128i *) &imIn->image32[x + 1 + xmin][xx + 4]);

            source = _mm_unpacklo_epi8(source1, source2);
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss4 = _mm_add_epi32(sss4, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss5 = _mm_add_epi32(sss5, _mm_madd_epi16(pix, mmk));

            source = _mm_unpackhi_epi8(source1, source2);
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss6 = _mm_add_epi32(sss6, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss7 = _mm_add_epi32(sss7, _mm_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x += 1) {
            __m128i source, source1, pix, mmk;
            mmk = _mm_set1_epi32(k[x]);

            source1 = _mm_loadu_si128(  // top line
                (__m128i *) &imIn->image32[x + xmin][xx]);

            source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));

            source = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));

            source1 = _mm_loadu_si128(  // top line
                (__m128i *) &imIn->image32[x + xmin][xx + 4]);

            source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss4 = _mm_add_epi32(sss4, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss5 = _mm_add_epi32(sss5, _mm_madd_epi16(pix, mmk));

            source = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss6 = _mm_add_epi32(sss6, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss7 = _mm_add_epi32(sss7, _mm_madd_epi16(pix, mmk));
        }

        sss0 = _mm_srai_epi32(sss0, coefs_precision);
        sss1 = _mm_srai_epi32(sss1, coefs_precision);
        sss2 = _mm_srai_epi32(sss2, coefs_precision);
        sss3 = _mm_srai_epi32(sss3, coefs_precision);
        sss4 = _mm_srai_epi32(sss4, coefs_precision);
        sss5 = _mm_srai_epi32(sss5, coefs_precision);
        sss6 = _mm_srai_epi32(sss6, coefs_precision);
        sss7 = _mm_srai_epi32(sss7, coefs_precision);

        sss0 = _mm_packs_epi32(sss0, sss1);
        sss2 = _mm_packs_epi32(sss2, sss3);
        sss0 = _mm_packus_epi16(sss0, sss2);
        _mm_storeu_si128((__m128i *) &lineOut[xx], sss0);
        sss4 = _mm_packs_epi32(sss4, sss5);
        sss6 = _mm_packs_epi32(sss6, sss7);
        sss4 = _mm_packus_epi16(sss4, sss6);
        _mm_storeu_si128((__m128i *) &lineOut[xx + 4], sss4);
    }

#endif

    for (; xx < xsize - 1; xx += 2) {
        __m128i sss0 = initial;  // left row
        __m128i sss1 = initial;  // right row
        x = 0;

        for (; x < xmax - 1; x += 2) {
            __m128i source, source1, source2;
            __m128i pix, mmk, mmk1;

            mmk = _mm_set1_epi32(k[x]);
            mmk1 = _mm_set1_epi32(k[x + 1]);
            mmk = _mm_unpacklo_epi16(
                _mm_packs_epi32(mmk, mmk),
                _mm_packs_epi32(mmk1, mmk1));

            source1 = _mm_loadl_epi64(  // top line
                (__m128i *) &imIn->image32[x + xmin][xx]);
            source2 = _mm_loadl_epi64(  // bottom line
                (__m128i *) &imIn->image32[x + 1 + xmin][xx]);

            source = _mm_unpacklo_epi8(source1, source2);
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x += 1) {
            __m128i source, source1, pix, mmk;
            mmk = _mm_set1_epi32(k[x]);

            source1 = _mm_loadl_epi64(  // top line
                (__m128i *) &imIn->image32[x + xmin][xx]);

            source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
            pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
            sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
        }

        sss0 = _mm_srai_epi32(sss0, coefs_precision);
        sss1 = _mm_srai_epi32(sss1, coefs_precision);

        sss0 = _mm_packs_epi32(sss0, sss1);
        sss0 = _mm_packus_epi16(sss0, sss0);
        _mm_storel_epi64((__m128i *) &lineOut[xx], sss0);
    }

    for (; xx < xsize; xx++) {
        __m128i sss = initial;
        x = 0;

        for (; x < xmax - 1; x += 2) {
            __m128i source, source1, source2;
            __m128i pix, mmk, mmk1;

            mmk = _mm_set1_epi32(k[x]);
            mmk1 = _mm_set1_epi32(k[x + 1]);
            mmk = _mm_unpacklo_epi16(
                _mm_packs_epi32(mmk, mmk),
                _mm_packs_epi32(mmk1, mmk1));

            source1 = _mm_cvtsi32_si128(  // top line
                *(int *) &imIn->image32[x + xmin][xx]);
            source2 = _mm_cvtsi32_si128(  // bottom line
                *(int *) &imIn->image32[x + 1 + xmin][xx]);

            source = _mm_unpacklo_epi8(source1, source2);
            pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
        }

        for (; x < xmax; x++) {
            __m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &imIn->image32[x + xmin][xx]);
            __m128i mmk = _mm_set1_epi32(k[x]);
            sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
        }

        sss = _mm_srai_epi32(sss, coefs_precision);
        sss = _mm_packs_epi32(sss, sss);
        lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
    }
}
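The vertical kernel pairs taps differently from the horizontal one: instead of byte shuffles, it interleaves bytes from the top and bottom source rows with an unpack, zero-extends with a second unpack, and lets _mm_madd_epi16 combine each pixel with its two-row coefficient pair. The step in isolation (SSE2 only, illustrative values):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* One RGBA pixel from the top row and one from the bottom row. */
    uint8_t top[4]    = {10, 20, 30, 40};
    uint8_t bottom[4] = {50, 60, 70, 80};
    int16_t k0 = 100, k1 = 28;   /* per-row coefficients */

    __m128i source1 = _mm_cvtsi32_si128(*(int *) top);
    __m128i source2 = _mm_cvtsi32_si128(*(int *) bottom);

    // [8] ... a1 a0 b1 b0 g1 g0 r1 r0 -- interleave rows byte by byte
    __m128i source = _mm_unpacklo_epi8(source1, source2);
    // [16] a1 a0 b1 b0 g1 g0 r1 r0 -- zero-extend to 16 bits
    __m128i pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());

    // [16] k1 k0 k1 k0 ... -- same layout the kernel builds via packs/unpacklo
    __m128i mmk = _mm_set1_epi32(
        (int) (((uint32_t)(uint16_t) k1 << 16) | (uint16_t) k0));

    // [32] per channel: top * k0 + bottom * k1
    __m128i sss = _mm_madd_epi16(pix, mmk);

    int out[4];
    _mm_storeu_si128((__m128i *) out, sss);
    printf("r=%d g=%d b=%d a=%d\n", out[0], out[1], out[2], out[3]);
    /* r = 10*100 + 50*28 = 2400 */
    return 0;
}

The kernel applies this interleave to the lo and hi halves of each load, and across multiple loads, to cover up to eight output columns per iteration.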

libImaging/Resample.c

@@ -1,6 +1,17 @@
 #include "Imaging.h"
 
 #include <math.h>
+#include <emmintrin.h>
+#include <mmintrin.h>
+#include <smmintrin.h>
+#if defined(__AVX2__)
+    #include <immintrin.h>
+#endif
+
+#include "ResampleSIMDHorizontalConv.c"
+#include "ResampleSIMDVerticalConv.c"
+
 
 #define ROUND_UP(f) ((int)((f) >= 0.0 ? (f) + 0.5F : (f)-0.5F))
@@ -90,6 +101,8 @@ static struct filter LANCZOS = {lanczos_filter, 3.0};
    in the other it will be more than 1.0. That is why we need
    two extra bits for overflow and int type. */
 #define PRECISION_BITS (32 - 8 - 2)
+/* We use signed INT16 type to store coefficients. */
+#define MAX_COEFS_PRECISION (16 - 1)
 
 /* Handles values from -640 to 639. */
 UINT8 _clip8_lookups[1280] = {
@@ -173,9 +186,9 @@ UINT8 _clip8_lookups[1280] = {
 
 UINT8 *clip8_lookups = &_clip8_lookups[640];
 
-static inline UINT8
-clip8(int in) {
-    return clip8_lookups[in >> PRECISION_BITS];
+static inline UINT8 clip8(int in, int precision)
+{
+    return clip8_lookups[in >> precision];
 }
 
 int
@@ -265,34 +278,54 @@ precompute_coeffs(
     return ksize;
 }
 
-void
-normalize_coeffs_8bpc(int outSize, int ksize, double *prekk) {
+int
+normalize_coeffs_8bpc(int outSize, int ksize, double *prekk)
+{
     int x;
-    INT32 *kk;
+    INT16 *kk;
+    int coefs_precision;
+    double maxkk;
 
     // use the same buffer for normalized coefficients
-    kk = (INT32 *)prekk;
+    kk = (INT16 *) prekk;
+
+    maxkk = prekk[0];
+    for (x = 0; x < outSize * ksize; x++) {
+        if (maxkk < prekk[x]) {
+            maxkk = prekk[x];
+        }
+    }
+
+    coefs_precision = 0;
+    while (maxkk < (1 << (MAX_COEFS_PRECISION - 1)) &&
+           coefs_precision < PRECISION_BITS) {
+        maxkk *= 2;
+        coefs_precision += 1;
+    }
 
     for (x = 0; x < outSize * ksize; x++) {
         if (prekk[x] < 0) {
-            kk[x] = (int)(-0.5 + prekk[x] * (1 << PRECISION_BITS));
+            kk[x] = (int) (-0.5 + prekk[x] * (1 << coefs_precision));
         } else {
-            kk[x] = (int)(0.5 + prekk[x] * (1 << PRECISION_BITS));
+            kk[x] = (int) (0.5 + prekk[x] * (1 << coefs_precision));
         }
     }
+
+    return coefs_precision;
 }
 
 void
 ImagingResampleHorizontal_8bpc(
     Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
     ImagingSectionCookie cookie;
-    int ss0, ss1, ss2, ss3;
+    int ss0;
     int xx, yy, x, xmin, xmax;
-    INT32 *k, *kk;
+    INT16 *k, *kk;
+    int coefs_precision;
 
     // use the same buffer for normalized coefficients
-    kk = (INT32 *)prekk;
-    normalize_coeffs_8bpc(imOut->xsize, ksize, prekk);
+    kk = (INT16 *) prekk;
+    coefs_precision = normalize_coeffs_8bpc(imOut->xsize, ksize, prekk);
 
     ImagingSectionEnter(&cookie);
     if (imIn->image8) {
@@ -301,74 +334,36 @@ ImagingResampleHorizontal_8bpc(
                 xmin = bounds[xx * 2 + 0];
                 xmax = bounds[xx * 2 + 1];
                 k = &kk[xx * ksize];
-                ss0 = 1 << (PRECISION_BITS - 1);
+                ss0 = 1 << (coefs_precision - 1);
                 for (x = 0; x < xmax; x++) {
                     ss0 += ((UINT8)imIn->image8[yy + offset][x + xmin]) * k[x];
                 }
-                imOut->image8[yy][xx] = clip8(ss0);
+                imOut->image8[yy][xx] = clip8(ss0, coefs_precision);
             }
         }
     } else if (imIn->type == IMAGING_TYPE_UINT8) {
-        if (imIn->bands == 2) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    xmin = bounds[xx * 2 + 0];
-                    xmax = bounds[xx * 2 + 1];
-                    k = &kk[xx * ksize];
-                    ss0 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (x = 0; x < xmax; x++) {
-                        ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) * k[x];
-                        ss3 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 3]) * k[x];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else if (imIn->bands == 3) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    xmin = bounds[xx * 2 + 0];
-                    xmax = bounds[xx * 2 + 1];
-                    k = &kk[xx * ksize];
-                    ss0 = ss1 = ss2 = 1 << (PRECISION_BITS - 1);
-                    for (x = 0; x < xmax; x++) {
-                        ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) * k[x];
-                        ss1 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 1]) * k[x];
-                        ss2 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 2]) * k[x];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    xmin = bounds[xx * 2 + 0];
-                    xmax = bounds[xx * 2 + 1];
-                    k = &kk[xx * ksize];
-                    ss0 = ss1 = ss2 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (x = 0; x < xmax; x++) {
-                        ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) * k[x];
-                        ss1 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 1]) * k[x];
-                        ss2 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 2]) * k[x];
-                        ss3 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 3]) * k[x];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        }
+        yy = 0;
+        for (; yy < imOut->ysize - 3; yy += 4) {
+            ImagingResampleHorizontalConvolution8u4x(
+                (UINT32 *) imOut->image32[yy],
+                (UINT32 *) imOut->image32[yy + 1],
+                (UINT32 *) imOut->image32[yy + 2],
+                (UINT32 *) imOut->image32[yy + 3],
+                (UINT32 *) imIn->image32[yy + offset],
+                (UINT32 *) imIn->image32[yy + offset + 1],
+                (UINT32 *) imIn->image32[yy + offset + 2],
+                (UINT32 *) imIn->image32[yy + offset + 3],
+                imOut->xsize, bounds, kk, ksize,
+                coefs_precision
+            );
+        }
+        for (; yy < imOut->ysize; yy++) {
+            ImagingResampleHorizontalConvolution8u(
+                (UINT32 *) imOut->image32[yy],
+                (UINT32 *) imIn->image32[yy + offset],
+                imOut->xsize, bounds, kk, ksize,
+                coefs_precision
+            );
+        }
     }
     ImagingSectionLeave(&cookie);
@@ -378,13 +373,14 @@ void
 ImagingResampleVertical_8bpc(
     Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
     ImagingSectionCookie cookie;
-    int ss0, ss1, ss2, ss3;
+    int ss0;
     int xx, yy, y, ymin, ymax;
-    INT32 *k, *kk;
+    INT16 *k, *kk;
+    int coefs_precision;
 
     // use the same buffer for normalized coefficients
-    kk = (INT32 *)prekk;
-    normalize_coeffs_8bpc(imOut->ysize, ksize, prekk);
+    kk = (INT16 *) prekk;
+    coefs_precision = normalize_coeffs_8bpc(imOut->ysize, ksize, prekk);
 
     ImagingSectionEnter(&cookie);
     if (imIn->image8) {
@@ -393,65 +389,22 @@ ImagingResampleVertical_8bpc(
             ymin = bounds[yy * 2 + 0];
             ymax = bounds[yy * 2 + 1];
             for (xx = 0; xx < imOut->xsize; xx++) {
-                ss0 = 1 << (PRECISION_BITS - 1);
+                ss0 = 1 << (coefs_precision - 1);
                 for (y = 0; y < ymax; y++) {
                     ss0 += ((UINT8)imIn->image8[y + ymin][xx]) * k[y];
                 }
-                imOut->image8[yy][xx] = clip8(ss0);
+                imOut->image8[yy][xx] = clip8(ss0, coefs_precision);
             }
         }
     } else if (imIn->type == IMAGING_TYPE_UINT8) {
-        if (imIn->bands == 2) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                k = &kk[yy * ksize];
-                ymin = bounds[yy * 2 + 0];
-                ymax = bounds[yy * 2 + 1];
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    ss0 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (y = 0; y < ymax; y++) {
-                        ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
-                        ss3 += ((UINT8)imIn->image[y + ymin][xx * 4 + 3]) * k[y];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else if (imIn->bands == 3) {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                k = &kk[yy * ksize];
-                ymin = bounds[yy * 2 + 0];
-                ymax = bounds[yy * 2 + 1];
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    ss0 = ss1 = ss2 = 1 << (PRECISION_BITS - 1);
-                    for (y = 0; y < ymax; y++) {
-                        ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
-                        ss1 += ((UINT8)imIn->image[y + ymin][xx * 4 + 1]) * k[y];
-                        ss2 += ((UINT8)imIn->image[y + ymin][xx * 4 + 2]) * k[y];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        } else {
-            for (yy = 0; yy < imOut->ysize; yy++) {
-                k = &kk[yy * ksize];
-                ymin = bounds[yy * 2 + 0];
-                ymax = bounds[yy * 2 + 1];
-                for (xx = 0; xx < imOut->xsize; xx++) {
-                    UINT32 v;
-                    ss0 = ss1 = ss2 = ss3 = 1 << (PRECISION_BITS - 1);
-                    for (y = 0; y < ymax; y++) {
-                        ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
-                        ss1 += ((UINT8)imIn->image[y + ymin][xx * 4 + 1]) * k[y];
-                        ss2 += ((UINT8)imIn->image[y + ymin][xx * 4 + 2]) * k[y];
-                        ss3 += ((UINT8)imIn->image[y + ymin][xx * 4 + 3]) * k[y];
-                    }
-                    v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
-                    memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
-                }
-            }
-        }
+        for (yy = 0; yy < imOut->ysize; yy++) {
+            k = &kk[yy * ksize];
+            ymin = bounds[yy * 2 + 0];
+            ymax = bounds[yy * 2 + 1];
+            ImagingResampleVerticalConvolution8u(
+                (UINT32 *) imOut->image32[yy], imIn,
+                ymin, ymax, k, coefs_precision
+            );
+        }
     }
     ImagingSectionLeave(&cookie);
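The key contract change in Resample.c: normalize_coeffs_8bpc() no longer scales coefficients by the fixed PRECISION_BITS (22 bits, which cannot fit in the INT16 slots that _mm_madd_epi16 consumes) but by the largest power-of-two shift that keeps the biggest coefficient inside the signed 16-bit range, and it returns that shift so the kernels and clip8() can undo it. The selection logic, distilled into a standalone sketch (an illustrative wrapper around the same arithmetic):

#include <stdio.h>

#define PRECISION_BITS (32 - 8 - 2)
#define MAX_COEFS_PRECISION (16 - 1)

/* Grow the fixed-point shift while the largest coefficient, once
 * scaled, still stays within the INT16 budget (and never beyond the
 * old PRECISION_BITS). */
static int
pick_coefs_precision(double maxkk)
{
    int coefs_precision = 0;
    while (maxkk < (1 << (MAX_COEFS_PRECISION - 1)) &&
           coefs_precision < PRECISION_BITS) {
        maxkk *= 2;
        coefs_precision += 1;
    }
    return coefs_precision;
}

int main(void)
{
    /* A typical largest filter coefficient is near 1.0, giving 14 bits. */
    printf("%d\n", pick_coefs_precision(1.0));  /* prints 14 */
    return 0;
}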