mirror of
https://github.com/python-pillow/Pillow.git
synced 2025-08-21 04:34:47 +03:00
SIMD Resample. unrolled SSE4 & AVX2
This commit is contained in:
parent
b4a54840ee
commit
8796b40ef6
|
@ -81,12 +81,13 @@ class TestImagingCoreResampleAccuracy:
|
||||||
s_px[size[0] - x - 1, y] = 255 - val
|
s_px[size[0] - x - 1, y] = 255 - val
|
||||||
return sample
|
return sample
|
||||||
|
|
||||||
def check_case(self, case, sample):
|
def check_case(self, case, sample, strict=True):
|
||||||
s_px = sample.load()
|
s_px = sample.load()
|
||||||
c_px = case.load()
|
c_px = case.load()
|
||||||
for y in range(case.size[1]):
|
for y in range(case.size[1]):
|
||||||
for x in range(case.size[0]):
|
for x in range(case.size[0]):
|
||||||
if c_px[x, y] != s_px[x, y]:
|
diff = abs(c_px[x, y] - s_px[x, y])
|
||||||
|
if (strict and diff) or diff > 1:
|
||||||
message = (
|
message = (
|
||||||
f"\nHave: \n{self.serialize_image(case)}\n"
|
f"\nHave: \n{self.serialize_image(case)}\n"
|
||||||
f"\nExpected: \n{self.serialize_image(sample)}"
|
f"\nExpected: \n{self.serialize_image(sample)}"
|
||||||
|
@ -217,7 +218,7 @@ class TestImagingCoreResampleAccuracy:
|
||||||
"b8 b7 b4 bf c4 a0"
|
"b8 b7 b4 bf c4 a0"
|
||||||
)
|
)
|
||||||
for channel in case.split():
|
for channel in case.split():
|
||||||
self.check_case(channel, self.make_sample(data, (12, 12)))
|
self.check_case(channel, self.make_sample(data, (12, 12)), strict=False)
|
||||||
|
|
||||||
def test_box_filter_correct_range(self):
|
def test_box_filter_correct_range(self):
|
||||||
im = Image.new("RGB", (8, 8), "#1688ff").resize((100, 100), Image.BOX)
|
im = Image.new("RGB", (8, 8), "#1688ff").resize((100, 100), Image.BOX)
|
||||||
|
|
382
libImaging/ResampleSIMDHorizontalConv.c
Normal file
382
libImaging/ResampleSIMDHorizontalConv.c
Normal file
|
@ -0,0 +1,382 @@
|
||||||
|
void
|
||||||
|
ImagingResampleHorizontalConvolution8u4x(
|
||||||
|
UINT32 *lineOut0, UINT32 *lineOut1, UINT32 *lineOut2, UINT32 *lineOut3,
|
||||||
|
UINT32 *lineIn0, UINT32 *lineIn1, UINT32 *lineIn2, UINT32 *lineIn3,
|
||||||
|
int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
|
||||||
|
{
|
||||||
|
int xmin, xmax, xx, x;
|
||||||
|
INT16 *k;
|
||||||
|
|
||||||
|
for (xx = 0; xx < xsize; xx++) {
|
||||||
|
xmin = xbounds[xx * 2 + 0];
|
||||||
|
xmax = xbounds[xx * 2 + 1];
|
||||||
|
k = &kk[xx * kmax];
|
||||||
|
x = 0;
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
__m256i sss0, sss1;
|
||||||
|
__m256i zero = _mm256_setzero_si256();
|
||||||
|
__m256i initial = _mm256_set1_epi32((1 << (coefs_precision -1)));
|
||||||
|
sss0 = initial;
|
||||||
|
sss1 = initial;
|
||||||
|
|
||||||
|
for (; x < xmax - 3; x += 4) {
|
||||||
|
__m256i pix, mmk0, mmk1, source, ksource;
|
||||||
|
__m128i tmp = _mm_loadl_epi64((__m128i *) &k[x]);
|
||||||
|
ksource = _mm256_insertf128_si256(
|
||||||
|
_mm256_castsi128_si256(tmp), tmp, 1);
|
||||||
|
|
||||||
|
mmk0 = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0,
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
|
||||||
|
mmk1 = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
|
||||||
|
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
|
||||||
|
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
|
||||||
|
|
||||||
|
source = _mm256_inserti128_si256(_mm256_castsi128_si256(
|
||||||
|
_mm_loadu_si128((__m128i *) &lineIn0[x + xmin])),
|
||||||
|
_mm_loadu_si128((__m128i *) &lineIn1[x + xmin]), 1);
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
|
||||||
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));
|
||||||
|
|
||||||
|
source = _mm256_inserti128_si256(_mm256_castsi128_si256(
|
||||||
|
_mm_loadu_si128((__m128i *) &lineIn2[x + xmin])),
|
||||||
|
_mm_loadu_si128((__m128i *) &lineIn3[x + xmin]), 1);
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
|
||||||
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m256i pix, mmk;
|
||||||
|
__m128i ksource = _mm_cvtsi32_si128(*(int *) &k[x]);
|
||||||
|
ksource = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
|
||||||
|
mmk = _mm256_inserti128_si256(_mm256_castsi128_si256(ksource), ksource, 1);
|
||||||
|
|
||||||
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
|
||||||
|
_mm_loadl_epi64((__m128i *) &lineIn0[x + xmin])),
|
||||||
|
_mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]), 1);
|
||||||
|
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
|
||||||
|
_mm_loadl_epi64((__m128i *) &lineIn2[x + xmin])),
|
||||||
|
_mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]), 1);
|
||||||
|
pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax; x ++) {
|
||||||
|
__m256i pix, mmk;
|
||||||
|
|
||||||
|
// [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
|
||||||
|
mmk = _mm256_set1_epi32(k[x]);
|
||||||
|
|
||||||
|
// [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
|
||||||
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
|
||||||
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin])),
|
||||||
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]), 1);
|
||||||
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
|
||||||
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin])),
|
||||||
|
_mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]), 1);
|
||||||
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
|
||||||
|
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
|
||||||
|
sss0 = _mm256_packs_epi32(sss0, zero);
|
||||||
|
sss1 = _mm256_packs_epi32(sss1, zero);
|
||||||
|
sss0 = _mm256_packus_epi16(sss0, zero);
|
||||||
|
sss1 = _mm256_packus_epi16(sss1, zero);
|
||||||
|
lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
|
||||||
|
lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
|
||||||
|
lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
|
||||||
|
lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
__m128i sss0, sss1, sss2, sss3;
|
||||||
|
__m128i initial = _mm_set1_epi32((1 << (coefs_precision -1)));
|
||||||
|
sss0 = initial;
|
||||||
|
sss1 = initial;
|
||||||
|
sss2 = initial;
|
||||||
|
sss3 = initial;
|
||||||
|
|
||||||
|
for (; x < xmax - 3; x += 4) {
|
||||||
|
__m128i pix, mmk_lo, mmk_hi, source;
|
||||||
|
__m128i mask_lo = _mm_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
|
||||||
|
__m128i mask_hi = _mm_set_epi8(
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8);
|
||||||
|
|
||||||
|
// [16] xx xx xx xx k3 k2 k1 k0
|
||||||
|
__m128i ksource = _mm_loadl_epi64((__m128i *) &k[x]);
|
||||||
|
// [16] k1 k0 k1 k0 k1 k0 k1 k0
|
||||||
|
mmk_lo = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
|
||||||
|
mmk_hi = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
|
||||||
|
|
||||||
|
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
|
||||||
|
source = _mm_loadu_si128((__m128i *) &lineIn0[x + xmin]);
|
||||||
|
// [16] a1 a0 b1 b0 g1 g0 r1 r0
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_lo);
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_lo));
|
||||||
|
// [16] a3 a2 b3 b2 g3 g2 r3 r2
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_hi);
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk_hi));
|
||||||
|
|
||||||
|
source = _mm_loadu_si128((__m128i *) &lineIn1[x + xmin]);
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_lo);
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_lo));
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_hi);
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk_hi));
|
||||||
|
|
||||||
|
source = _mm_loadu_si128((__m128i *) &lineIn2[x + xmin]);
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_lo);
|
||||||
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_lo));
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_hi);
|
||||||
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk_hi));
|
||||||
|
|
||||||
|
source = _mm_loadu_si128((__m128i *) &lineIn3[x + xmin]);
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_lo);
|
||||||
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_lo));
|
||||||
|
pix = _mm_shuffle_epi8(source, mask_hi);
|
||||||
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk_hi));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m128i pix, mmk;
|
||||||
|
__m128i mask = _mm_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0);
|
||||||
|
|
||||||
|
// [16] k1 k0 k1 k0 k1 k0 k1 k0
|
||||||
|
mmk = _mm_set1_epi32(*(int *) &k[x]);
|
||||||
|
|
||||||
|
// [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0
|
||||||
|
pix = _mm_loadl_epi64((__m128i *) &lineIn0[x + xmin]);
|
||||||
|
// [16] a1 a0 b1 b0 g1 g0 r1 r0
|
||||||
|
pix = _mm_shuffle_epi8(pix, mask);
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_loadl_epi64((__m128i *) &lineIn1[x + xmin]);
|
||||||
|
pix = _mm_shuffle_epi8(pix, mask);
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_loadl_epi64((__m128i *) &lineIn2[x + xmin]);
|
||||||
|
pix = _mm_shuffle_epi8(pix, mask);
|
||||||
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_loadl_epi64((__m128i *) &lineIn3[x + xmin]);
|
||||||
|
pix = _mm_shuffle_epi8(pix, mask);
|
||||||
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax; x ++) {
|
||||||
|
__m128i pix, mmk;
|
||||||
|
// [16] xx k0 xx k0 xx k0 xx k0
|
||||||
|
mmk = _mm_set1_epi32(k[x]);
|
||||||
|
// [16] xx a0 xx b0 xx g0 xx r0
|
||||||
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn0[x + xmin]);
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn1[x + xmin]);
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn2[x + xmin]);
|
||||||
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn3[x + xmin]);
|
||||||
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
sss0 = _mm_srai_epi32(sss0, coefs_precision);
|
||||||
|
sss1 = _mm_srai_epi32(sss1, coefs_precision);
|
||||||
|
sss2 = _mm_srai_epi32(sss2, coefs_precision);
|
||||||
|
sss3 = _mm_srai_epi32(sss3, coefs_precision);
|
||||||
|
sss0 = _mm_packs_epi32(sss0, sss0);
|
||||||
|
sss1 = _mm_packs_epi32(sss1, sss1);
|
||||||
|
sss2 = _mm_packs_epi32(sss2, sss2);
|
||||||
|
sss3 = _mm_packs_epi32(sss3, sss3);
|
||||||
|
lineOut0[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss0, sss0));
|
||||||
|
lineOut1[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss1, sss1));
|
||||||
|
lineOut2[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss2, sss2));
|
||||||
|
lineOut3[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss3, sss3));
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
ImagingResampleHorizontalConvolution8u(UINT32 *lineOut, UINT32 *lineIn,
|
||||||
|
int xsize, int *xbounds, INT16 *kk, int kmax, int coefs_precision)
|
||||||
|
{
|
||||||
|
int xmin, xmax, xx, x;
|
||||||
|
INT16 *k;
|
||||||
|
|
||||||
|
for (xx = 0; xx < xsize; xx++) {
|
||||||
|
__m128i sss;
|
||||||
|
xmin = xbounds[xx * 2 + 0];
|
||||||
|
xmax = xbounds[xx * 2 + 1];
|
||||||
|
k = &kk[xx * kmax];
|
||||||
|
x = 0;
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
if (xmax < 8) {
|
||||||
|
sss = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
|
||||||
|
} else {
|
||||||
|
|
||||||
|
__m256i sss256 = _mm256_set1_epi32((1 << (coefs_precision -2)) + xmax / 4);
|
||||||
|
|
||||||
|
for (; x < xmax - 7; x += 8) {
|
||||||
|
__m256i pix, mmk, source;
|
||||||
|
__m128i tmp = _mm_loadu_si128((__m128i *) &k[x]);
|
||||||
|
__m256i ksource = _mm256_insertf128_si256(
|
||||||
|
_mm256_castsi128_si256(tmp), tmp, 1);
|
||||||
|
|
||||||
|
source = _mm256_loadu_si256((__m256i *) &lineIn[x + xmin]);
|
||||||
|
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
|
||||||
|
11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
|
||||||
|
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
|
||||||
|
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
|
||||||
|
15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
|
||||||
|
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
|
||||||
|
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax - 3; x += 4) {
|
||||||
|
__m256i pix, mmk, source;
|
||||||
|
__m128i tmp = _mm_loadl_epi64((__m128i *) &k[x]);
|
||||||
|
__m256i ksource = _mm256_insertf128_si256(
|
||||||
|
_mm256_castsi128_si256(tmp), tmp, 1);
|
||||||
|
|
||||||
|
tmp = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
|
||||||
|
source = _mm256_insertf128_si256(
|
||||||
|
_mm256_castsi128_si256(tmp), tmp, 1);
|
||||||
|
|
||||||
|
pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
|
||||||
|
-1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
|
||||||
|
7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
|
||||||
|
sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
sss = _mm_add_epi32(
|
||||||
|
_mm256_extracti128_si256(sss256, 0),
|
||||||
|
_mm256_extracti128_si256(sss256, 1)
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
sss = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
|
||||||
|
|
||||||
|
for (; x < xmax - 7; x += 8) {
|
||||||
|
__m128i pix, mmk, source;
|
||||||
|
__m128i ksource = _mm_loadu_si128((__m128i *) &k[x]);
|
||||||
|
|
||||||
|
source = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
5,4, 1,0, 5,4, 1,0, 5,4, 1,0, 5,4, 1,0));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
7,6, 3,2, 7,6, 3,2, 7,6, 3,2, 7,6, 3,2));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm_loadu_si128((__m128i *) &lineIn[x + 4 + xmin]);
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
13,12, 9,8, 13,12, 9,8, 13,12, 9,8, 13,12, 9,8));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
15,14, 11,10, 15,14, 11,10, 15,14, 11,10, 15,14, 11,10));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax - 3; x += 4) {
|
||||||
|
__m128i pix, mmk;
|
||||||
|
__m128i source = _mm_loadu_si128((__m128i *) &lineIn[x + xmin]);
|
||||||
|
__m128i ksource = _mm_loadl_epi64((__m128i *) &k[x]);
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,11, -1,3, -1,10, -1,2, -1,9, -1,1, -1,8, -1,0));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
5,4, 1,0, 5,4, 1,0, 5,4, 1,0, 5,4, 1,0));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,15, -1,7, -1,14, -1,6, -1,13, -1,5, -1,12, -1,4));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
7,6, 3,2, 7,6, 3,2, 7,6, 3,2, 7,6, 3,2));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m128i pix, mmk;
|
||||||
|
__m128i source = _mm_loadl_epi64((__m128i *) &lineIn[x + xmin]);
|
||||||
|
__m128i ksource = _mm_cvtsi32_si128(*(int *) &k[x]);
|
||||||
|
|
||||||
|
pix = _mm_shuffle_epi8(source, _mm_set_epi8(
|
||||||
|
-1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
|
||||||
|
mmk = _mm_shuffle_epi8(ksource, _mm_set_epi8(
|
||||||
|
3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; x < xmax; x ++) {
|
||||||
|
__m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &lineIn[x + xmin]);
|
||||||
|
__m128i mmk = _mm_set1_epi32(k[x]);
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
sss = _mm_srai_epi32(sss, coefs_precision);
|
||||||
|
sss = _mm_packs_epi32(sss, sss);
|
||||||
|
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
|
||||||
|
}
|
||||||
|
}
|
263
libImaging/ResampleSIMDVerticalConv.c
Normal file
263
libImaging/ResampleSIMDVerticalConv.c
Normal file
|
@ -0,0 +1,263 @@
|
||||||
|
|
||||||
|
void
|
||||||
|
ImagingResampleVerticalConvolution8u(UINT32 *lineOut, Imaging imIn,
|
||||||
|
int xmin, int xmax, INT16 *k, int coefs_precision)
|
||||||
|
{
|
||||||
|
int x;
|
||||||
|
int xx = 0;
|
||||||
|
int xsize = imIn->xsize;
|
||||||
|
|
||||||
|
__m128i initial = _mm_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
|
__m256i initial_256 = _mm256_set1_epi32((1 << (coefs_precision -1)) + xmax / 2);
|
||||||
|
|
||||||
|
for (; xx < xsize - 7; xx += 8) {
|
||||||
|
__m256i sss0 = initial_256;
|
||||||
|
__m256i sss1 = initial_256;
|
||||||
|
__m256i sss2 = initial_256;
|
||||||
|
__m256i sss3 = initial_256;
|
||||||
|
x = 0;
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m256i source, source1, source2;
|
||||||
|
__m256i pix, mmk, mmk1;
|
||||||
|
mmk = _mm256_set1_epi32(k[x]);
|
||||||
|
mmk1 = _mm256_set1_epi32(k[x + 1]);
|
||||||
|
mmk = _mm256_unpacklo_epi16(
|
||||||
|
_mm256_packs_epi32(mmk, mmk),
|
||||||
|
_mm256_packs_epi32(mmk1, mmk1));
|
||||||
|
|
||||||
|
source1 = _mm256_loadu_si256( // top line
|
||||||
|
(__m256i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
source2 = _mm256_loadu_si256( // bottom line
|
||||||
|
(__m256i *) &imIn->image32[x + 1 + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm256_unpacklo_epi8(source1, source2);
|
||||||
|
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm256_unpackhi_epi8(source1, source2);
|
||||||
|
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
for (; x < xmax; x += 1) {
|
||||||
|
__m256i source, source1, pix, mmk;
|
||||||
|
mmk = _mm256_set1_epi32(k[x]);
|
||||||
|
|
||||||
|
source1 = _mm256_loadu_si256( // top line
|
||||||
|
(__m256i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
|
||||||
|
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
|
||||||
|
pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
|
||||||
|
sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
sss0 = _mm256_srai_epi32(sss0, coefs_precision);
|
||||||
|
sss1 = _mm256_srai_epi32(sss1, coefs_precision);
|
||||||
|
sss2 = _mm256_srai_epi32(sss2, coefs_precision);
|
||||||
|
sss3 = _mm256_srai_epi32(sss3, coefs_precision);
|
||||||
|
|
||||||
|
sss0 = _mm256_packs_epi32(sss0, sss1);
|
||||||
|
sss2 = _mm256_packs_epi32(sss2, sss3);
|
||||||
|
sss0 = _mm256_packus_epi16(sss0, sss2);
|
||||||
|
_mm256_storeu_si256((__m256i *) &lineOut[xx], sss0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for (; xx < xsize - 7; xx += 8) {
|
||||||
|
__m128i sss0 = initial;
|
||||||
|
__m128i sss1 = initial;
|
||||||
|
__m128i sss2 = initial;
|
||||||
|
__m128i sss3 = initial;
|
||||||
|
__m128i sss4 = initial;
|
||||||
|
__m128i sss5 = initial;
|
||||||
|
__m128i sss6 = initial;
|
||||||
|
__m128i sss7 = initial;
|
||||||
|
x = 0;
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m128i source, source1, source2;
|
||||||
|
__m128i pix, mmk, mmk1;
|
||||||
|
mmk = _mm_set1_epi32(k[x]);
|
||||||
|
mmk1 = _mm_set1_epi32(k[x + 1]);
|
||||||
|
mmk = _mm_unpacklo_epi16(
|
||||||
|
_mm_packs_epi32(mmk, mmk),
|
||||||
|
_mm_packs_epi32(mmk1, mmk1));
|
||||||
|
|
||||||
|
source1 = _mm_loadu_si128( // top line
|
||||||
|
(__m128i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
source2 = _mm_loadu_si128( // bottom line
|
||||||
|
(__m128i *) &imIn->image32[x + 1 + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, source2);
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm_unpackhi_epi8(source1, source2);
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source1 = _mm_loadu_si128( // top line
|
||||||
|
(__m128i *) &imIn->image32[x + xmin][xx + 4]);
|
||||||
|
source2 = _mm_loadu_si128( // bottom line
|
||||||
|
(__m128i *) &imIn->image32[x + 1 + xmin][xx + 4]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, source2);
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss4 = _mm_add_epi32(sss4, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss5 = _mm_add_epi32(sss5, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm_unpackhi_epi8(source1, source2);
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss6 = _mm_add_epi32(sss6, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss7 = _mm_add_epi32(sss7, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
for (; x < xmax; x += 1) {
|
||||||
|
__m128i source, source1, pix, mmk;
|
||||||
|
mmk = _mm_set1_epi32(k[x]);
|
||||||
|
|
||||||
|
source1 = _mm_loadu_si128( // top line
|
||||||
|
(__m128i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss2 = _mm_add_epi32(sss2, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss3 = _mm_add_epi32(sss3, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source1 = _mm_loadu_si128( // top line
|
||||||
|
(__m128i *) &imIn->image32[x + xmin][xx + 4]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss4 = _mm_add_epi32(sss4, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss5 = _mm_add_epi32(sss5, _mm_madd_epi16(pix, mmk));
|
||||||
|
|
||||||
|
source = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss6 = _mm_add_epi32(sss6, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss7 = _mm_add_epi32(sss7, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
sss0 = _mm_srai_epi32(sss0, coefs_precision);
|
||||||
|
sss1 = _mm_srai_epi32(sss1, coefs_precision);
|
||||||
|
sss2 = _mm_srai_epi32(sss2, coefs_precision);
|
||||||
|
sss3 = _mm_srai_epi32(sss3, coefs_precision);
|
||||||
|
sss4 = _mm_srai_epi32(sss4, coefs_precision);
|
||||||
|
sss5 = _mm_srai_epi32(sss5, coefs_precision);
|
||||||
|
sss6 = _mm_srai_epi32(sss6, coefs_precision);
|
||||||
|
sss7 = _mm_srai_epi32(sss7, coefs_precision);
|
||||||
|
|
||||||
|
sss0 = _mm_packs_epi32(sss0, sss1);
|
||||||
|
sss2 = _mm_packs_epi32(sss2, sss3);
|
||||||
|
sss0 = _mm_packus_epi16(sss0, sss2);
|
||||||
|
_mm_storeu_si128((__m128i *) &lineOut[xx], sss0);
|
||||||
|
sss4 = _mm_packs_epi32(sss4, sss5);
|
||||||
|
sss6 = _mm_packs_epi32(sss6, sss7);
|
||||||
|
sss4 = _mm_packus_epi16(sss4, sss6);
|
||||||
|
_mm_storeu_si128((__m128i *) &lineOut[xx + 4], sss4);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; xx < xsize - 1; xx += 2) {
|
||||||
|
__m128i sss0 = initial; // left row
|
||||||
|
__m128i sss1 = initial; // right row
|
||||||
|
x = 0;
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m128i source, source1, source2;
|
||||||
|
__m128i pix, mmk, mmk1;
|
||||||
|
mmk = _mm_set1_epi32(k[x]);
|
||||||
|
mmk1 = _mm_set1_epi32(k[x + 1]);
|
||||||
|
mmk = _mm_unpacklo_epi16(
|
||||||
|
_mm_packs_epi32(mmk, mmk),
|
||||||
|
_mm_packs_epi32(mmk1, mmk1));
|
||||||
|
|
||||||
|
source1 = _mm_loadl_epi64( // top line
|
||||||
|
(__m128i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
source2 = _mm_loadl_epi64( // bottom line
|
||||||
|
(__m128i *) &imIn->image32[x + 1 + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, source2);
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
for (; x < xmax; x += 1) {
|
||||||
|
__m128i source, source1, pix, mmk;
|
||||||
|
mmk = _mm_set1_epi32(k[x]);
|
||||||
|
|
||||||
|
source1 = _mm_loadl_epi64( // top line
|
||||||
|
(__m128i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
|
||||||
|
pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
|
||||||
|
sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
sss0 = _mm_srai_epi32(sss0, coefs_precision);
|
||||||
|
sss1 = _mm_srai_epi32(sss1, coefs_precision);
|
||||||
|
|
||||||
|
sss0 = _mm_packs_epi32(sss0, sss1);
|
||||||
|
sss0 = _mm_packus_epi16(sss0, sss0);
|
||||||
|
_mm_storel_epi64((__m128i *) &lineOut[xx], sss0);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; xx < xsize; xx++) {
|
||||||
|
__m128i sss = initial;
|
||||||
|
x = 0;
|
||||||
|
for (; x < xmax - 1; x += 2) {
|
||||||
|
__m128i source, source1, source2;
|
||||||
|
__m128i pix, mmk, mmk1;
|
||||||
|
mmk = _mm_set1_epi32(k[x]);
|
||||||
|
mmk1 = _mm_set1_epi32(k[x + 1]);
|
||||||
|
mmk = _mm_unpacklo_epi16(
|
||||||
|
_mm_packs_epi32(mmk, mmk),
|
||||||
|
_mm_packs_epi32(mmk1, mmk1));
|
||||||
|
|
||||||
|
source1 = _mm_cvtsi32_si128( // top line
|
||||||
|
*(int *) &imIn->image32[x + xmin][xx]);
|
||||||
|
source2 = _mm_cvtsi32_si128( // bottom line
|
||||||
|
*(int *) &imIn->image32[x + 1 + xmin][xx]);
|
||||||
|
|
||||||
|
source = _mm_unpacklo_epi8(source1, source2);
|
||||||
|
pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
for (; x < xmax; x++) {
|
||||||
|
__m128i pix = _mm_cvtepu8_epi32(*(__m128i *) &imIn->image32[x + xmin][xx]);
|
||||||
|
__m128i mmk = _mm_set1_epi32(k[x]);
|
||||||
|
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
|
||||||
|
}
|
||||||
|
sss = _mm_srai_epi32(sss, coefs_precision);
|
||||||
|
sss = _mm_packs_epi32(sss, sss);
|
||||||
|
lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,17 @@
|
||||||
#include "Imaging.h"
|
#include "Imaging.h"
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#include <mmintrin.h>
|
||||||
|
#include <smmintrin.h>
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#include "ResampleSIMDHorizontalConv.c"
|
||||||
|
#include "ResampleSIMDVerticalConv.c"
|
||||||
|
|
||||||
#define ROUND_UP(f) ((int)((f) >= 0.0 ? (f) + 0.5F : (f)-0.5F))
|
#define ROUND_UP(f) ((int)((f) >= 0.0 ? (f) + 0.5F : (f)-0.5F))
|
||||||
|
|
||||||
|
@ -90,6 +101,8 @@ static struct filter LANCZOS = {lanczos_filter, 3.0};
|
||||||
in the other it will be more than 1.0. That is why we need
|
in the other it will be more than 1.0. That is why we need
|
||||||
two extra bits for overflow and int type. */
|
two extra bits for overflow and int type. */
|
||||||
#define PRECISION_BITS (32 - 8 - 2)
|
#define PRECISION_BITS (32 - 8 - 2)
|
||||||
|
/* We use signed INT16 type to store coefficients. */
|
||||||
|
#define MAX_COEFS_PRECISION (16 - 1)
|
||||||
|
|
||||||
/* Handles values form -640 to 639. */
|
/* Handles values form -640 to 639. */
|
||||||
UINT8 _clip8_lookups[1280] = {
|
UINT8 _clip8_lookups[1280] = {
|
||||||
|
@ -173,9 +186,9 @@ UINT8 _clip8_lookups[1280] = {
|
||||||
|
|
||||||
UINT8 *clip8_lookups = &_clip8_lookups[640];
|
UINT8 *clip8_lookups = &_clip8_lookups[640];
|
||||||
|
|
||||||
static inline UINT8
|
static inline UINT8 clip8(int in, int precision)
|
||||||
clip8(int in) {
|
{
|
||||||
return clip8_lookups[in >> PRECISION_BITS];
|
return clip8_lookups[in >> precision];
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -265,34 +278,54 @@ precompute_coeffs(
|
||||||
return ksize;
|
return ksize;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
|
||||||
normalize_coeffs_8bpc(int outSize, int ksize, double *prekk) {
|
int
|
||||||
|
normalize_coeffs_8bpc(int outSize, int ksize, double *prekk)
|
||||||
|
{
|
||||||
int x;
|
int x;
|
||||||
INT32 *kk;
|
INT16 *kk;
|
||||||
|
int coefs_precision;
|
||||||
|
double maxkk;
|
||||||
|
|
||||||
// use the same buffer for normalized coefficients
|
// use the same buffer for normalized coefficients
|
||||||
kk = (INT32 *)prekk;
|
kk = (INT16 *) prekk;
|
||||||
|
|
||||||
|
maxkk = prekk[0];
|
||||||
|
for (x = 0; x < outSize * ksize; x++) {
|
||||||
|
if (maxkk < prekk[x]) {
|
||||||
|
maxkk = prekk[x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
coefs_precision = 0;
|
||||||
|
while (maxkk < (1 << (MAX_COEFS_PRECISION-1)) && (coefs_precision < PRECISION_BITS)) {
|
||||||
|
maxkk *= 2;
|
||||||
|
coefs_precision += 1;
|
||||||
|
};
|
||||||
|
|
||||||
for (x = 0; x < outSize * ksize; x++) {
|
for (x = 0; x < outSize * ksize; x++) {
|
||||||
if (prekk[x] < 0) {
|
if (prekk[x] < 0) {
|
||||||
kk[x] = (int)(-0.5 + prekk[x] * (1 << PRECISION_BITS));
|
kk[x] = (int) (-0.5 + prekk[x] * (1 << coefs_precision));
|
||||||
} else {
|
} else {
|
||||||
kk[x] = (int)(0.5 + prekk[x] * (1 << PRECISION_BITS));
|
kk[x] = (int) (0.5 + prekk[x] * (1 << coefs_precision));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return coefs_precision;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ImagingResampleHorizontal_8bpc(
|
ImagingResampleHorizontal_8bpc(
|
||||||
Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
|
Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
|
||||||
ImagingSectionCookie cookie;
|
ImagingSectionCookie cookie;
|
||||||
int ss0, ss1, ss2, ss3;
|
int ss0;
|
||||||
int xx, yy, x, xmin, xmax;
|
int xx, yy, x, xmin, xmax;
|
||||||
INT32 *k, *kk;
|
INT16 *k, *kk;
|
||||||
|
int coefs_precision;
|
||||||
|
|
||||||
// use the same buffer for normalized coefficients
|
// use the same buffer for normalized coefficients
|
||||||
kk = (INT32 *)prekk;
|
kk = (INT16 *) prekk;
|
||||||
normalize_coeffs_8bpc(imOut->xsize, ksize, prekk);
|
coefs_precision = normalize_coeffs_8bpc(imOut->xsize, ksize, prekk);
|
||||||
|
|
||||||
ImagingSectionEnter(&cookie);
|
ImagingSectionEnter(&cookie);
|
||||||
if (imIn->image8) {
|
if (imIn->image8) {
|
||||||
|
@ -301,74 +334,36 @@ ImagingResampleHorizontal_8bpc(
|
||||||
xmin = bounds[xx * 2 + 0];
|
xmin = bounds[xx * 2 + 0];
|
||||||
xmax = bounds[xx * 2 + 1];
|
xmax = bounds[xx * 2 + 1];
|
||||||
k = &kk[xx * ksize];
|
k = &kk[xx * ksize];
|
||||||
ss0 = 1 << (PRECISION_BITS - 1);
|
ss0 = 1 << (coefs_precision -1);
|
||||||
for (x = 0; x < xmax; x++) {
|
for (x = 0; x < xmax; x++) {
|
||||||
ss0 += ((UINT8)imIn->image8[yy + offset][x + xmin]) * k[x];
|
ss0 += ((UINT8)imIn->image8[yy + offset][x + xmin]) * k[x];
|
||||||
}
|
}
|
||||||
imOut->image8[yy][xx] = clip8(ss0);
|
imOut->image8[yy][xx] = clip8(ss0, coefs_precision);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (imIn->type == IMAGING_TYPE_UINT8) {
|
} else if (imIn->type == IMAGING_TYPE_UINT8) {
|
||||||
if (imIn->bands == 2) {
|
yy = 0;
|
||||||
for (yy = 0; yy < imOut->ysize; yy++) {
|
for (; yy < imOut->ysize - 3; yy += 4) {
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
ImagingResampleHorizontalConvolution8u4x(
|
||||||
UINT32 v;
|
(UINT32 *) imOut->image32[yy],
|
||||||
xmin = bounds[xx * 2 + 0];
|
(UINT32 *) imOut->image32[yy + 1],
|
||||||
xmax = bounds[xx * 2 + 1];
|
(UINT32 *) imOut->image32[yy + 2],
|
||||||
k = &kk[xx * ksize];
|
(UINT32 *) imOut->image32[yy + 3],
|
||||||
ss0 = ss3 = 1 << (PRECISION_BITS - 1);
|
(UINT32 *) imIn->image32[yy + offset],
|
||||||
for (x = 0; x < xmax; x++) {
|
(UINT32 *) imIn->image32[yy + offset + 1],
|
||||||
ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) *
|
(UINT32 *) imIn->image32[yy + offset + 2],
|
||||||
k[x];
|
(UINT32 *) imIn->image32[yy + offset + 3],
|
||||||
ss3 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 3]) *
|
imOut->xsize, bounds, kk, ksize,
|
||||||
k[x];
|
coefs_precision
|
||||||
}
|
);
|
||||||
v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
|
|
||||||
memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (imIn->bands == 3) {
|
|
||||||
for (yy = 0; yy < imOut->ysize; yy++) {
|
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
|
||||||
UINT32 v;
|
|
||||||
xmin = bounds[xx * 2 + 0];
|
|
||||||
xmax = bounds[xx * 2 + 1];
|
|
||||||
k = &kk[xx * ksize];
|
|
||||||
ss0 = ss1 = ss2 = 1 << (PRECISION_BITS - 1);
|
|
||||||
for (x = 0; x < xmax; x++) {
|
|
||||||
ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) *
|
|
||||||
k[x];
|
|
||||||
ss1 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 1]) *
|
|
||||||
k[x];
|
|
||||||
ss2 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 2]) *
|
|
||||||
k[x];
|
|
||||||
}
|
|
||||||
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
|
|
||||||
memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (yy = 0; yy < imOut->ysize; yy++) {
|
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
|
||||||
UINT32 v;
|
|
||||||
xmin = bounds[xx * 2 + 0];
|
|
||||||
xmax = bounds[xx * 2 + 1];
|
|
||||||
k = &kk[xx * ksize];
|
|
||||||
ss0 = ss1 = ss2 = ss3 = 1 << (PRECISION_BITS - 1);
|
|
||||||
for (x = 0; x < xmax; x++) {
|
|
||||||
ss0 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 0]) *
|
|
||||||
k[x];
|
|
||||||
ss1 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 1]) *
|
|
||||||
k[x];
|
|
||||||
ss2 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 2]) *
|
|
||||||
k[x];
|
|
||||||
ss3 += ((UINT8)imIn->image[yy + offset][(x + xmin) * 4 + 3]) *
|
|
||||||
k[x];
|
|
||||||
}
|
|
||||||
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
|
|
||||||
memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
for (; yy < imOut->ysize; yy++) {
|
||||||
|
ImagingResampleHorizontalConvolution8u(
|
||||||
|
(UINT32 *) imOut->image32[yy],
|
||||||
|
(UINT32 *) imIn->image32[yy + offset],
|
||||||
|
imOut->xsize, bounds, kk, ksize,
|
||||||
|
coefs_precision
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ImagingSectionLeave(&cookie);
|
ImagingSectionLeave(&cookie);
|
||||||
|
@ -378,13 +373,14 @@ void
|
||||||
ImagingResampleVertical_8bpc(
|
ImagingResampleVertical_8bpc(
|
||||||
Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
|
Imaging imOut, Imaging imIn, int offset, int ksize, int *bounds, double *prekk) {
|
||||||
ImagingSectionCookie cookie;
|
ImagingSectionCookie cookie;
|
||||||
int ss0, ss1, ss2, ss3;
|
int ss0;
|
||||||
int xx, yy, y, ymin, ymax;
|
int xx, yy, y, ymin, ymax;
|
||||||
INT32 *k, *kk;
|
INT16 *k, *kk;
|
||||||
|
int coefs_precision;
|
||||||
|
|
||||||
// use the same buffer for normalized coefficients
|
// use the same buffer for normalized coefficients
|
||||||
kk = (INT32 *)prekk;
|
kk = (INT16 *) prekk;
|
||||||
normalize_coeffs_8bpc(imOut->ysize, ksize, prekk);
|
coefs_precision = normalize_coeffs_8bpc(imOut->ysize, ksize, prekk);
|
||||||
|
|
||||||
ImagingSectionEnter(&cookie);
|
ImagingSectionEnter(&cookie);
|
||||||
if (imIn->image8) {
|
if (imIn->image8) {
|
||||||
|
@ -393,65 +389,22 @@ ImagingResampleVertical_8bpc(
|
||||||
ymin = bounds[yy * 2 + 0];
|
ymin = bounds[yy * 2 + 0];
|
||||||
ymax = bounds[yy * 2 + 1];
|
ymax = bounds[yy * 2 + 1];
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
for (xx = 0; xx < imOut->xsize; xx++) {
|
||||||
ss0 = 1 << (PRECISION_BITS - 1);
|
ss0 = 1 << (coefs_precision -1);
|
||||||
for (y = 0; y < ymax; y++) {
|
for (y = 0; y < ymax; y++) {
|
||||||
ss0 += ((UINT8)imIn->image8[y + ymin][xx]) * k[y];
|
ss0 += ((UINT8)imIn->image8[y + ymin][xx]) * k[y];
|
||||||
}
|
}
|
||||||
imOut->image8[yy][xx] = clip8(ss0);
|
imOut->image8[yy][xx] = clip8(ss0, coefs_precision);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (imIn->type == IMAGING_TYPE_UINT8) {
|
} else if (imIn->type == IMAGING_TYPE_UINT8) {
|
||||||
if (imIn->bands == 2) {
|
|
||||||
for (yy = 0; yy < imOut->ysize; yy++) {
|
for (yy = 0; yy < imOut->ysize; yy++) {
|
||||||
k = &kk[yy * ksize];
|
k = &kk[yy * ksize];
|
||||||
ymin = bounds[yy * 2 + 0];
|
ymin = bounds[yy * 2 + 0];
|
||||||
ymax = bounds[yy * 2 + 1];
|
ymax = bounds[yy * 2 + 1];
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
ImagingResampleVerticalConvolution8u(
|
||||||
UINT32 v;
|
(UINT32 *) imOut->image32[yy], imIn,
|
||||||
ss0 = ss3 = 1 << (PRECISION_BITS - 1);
|
ymin, ymax, k, coefs_precision
|
||||||
for (y = 0; y < ymax; y++) {
|
);
|
||||||
ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
|
|
||||||
ss3 += ((UINT8)imIn->image[y + ymin][xx * 4 + 3]) * k[y];
|
|
||||||
}
|
|
||||||
v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
|
|
||||||
memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (imIn->bands == 3) {
|
|
||||||
for (yy = 0; yy < imOut->ysize; yy++) {
|
|
||||||
k = &kk[yy * ksize];
|
|
||||||
ymin = bounds[yy * 2 + 0];
|
|
||||||
ymax = bounds[yy * 2 + 1];
|
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
|
||||||
UINT32 v;
|
|
||||||
ss0 = ss1 = ss2 = 1 << (PRECISION_BITS - 1);
|
|
||||||
for (y = 0; y < ymax; y++) {
|
|
||||||
ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
|
|
||||||
ss1 += ((UINT8)imIn->image[y + ymin][xx * 4 + 1]) * k[y];
|
|
||||||
ss2 += ((UINT8)imIn->image[y + ymin][xx * 4 + 2]) * k[y];
|
|
||||||
}
|
|
||||||
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0);
|
|
||||||
memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (yy = 0; yy < imOut->ysize; yy++) {
|
|
||||||
k = &kk[yy * ksize];
|
|
||||||
ymin = bounds[yy * 2 + 0];
|
|
||||||
ymax = bounds[yy * 2 + 1];
|
|
||||||
for (xx = 0; xx < imOut->xsize; xx++) {
|
|
||||||
UINT32 v;
|
|
||||||
ss0 = ss1 = ss2 = ss3 = 1 << (PRECISION_BITS - 1);
|
|
||||||
for (y = 0; y < ymax; y++) {
|
|
||||||
ss0 += ((UINT8)imIn->image[y + ymin][xx * 4 + 0]) * k[y];
|
|
||||||
ss1 += ((UINT8)imIn->image[y + ymin][xx * 4 + 1]) * k[y];
|
|
||||||
ss2 += ((UINT8)imIn->image[y + ymin][xx * 4 + 2]) * k[y];
|
|
||||||
ss3 += ((UINT8)imIn->image[y + ymin][xx * 4 + 3]) * k[y];
|
|
||||||
}
|
|
||||||
v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
|
|
||||||
memcpy(imOut->image[yy] + xx * sizeof(v), &v, sizeof(v));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ImagingSectionLeave(&cookie);
|
ImagingSectionLeave(&cookie);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user