diff --git a/src/libImaging/Bands.c b/src/libImaging/Bands.c index e1b16b34a..4f50c85a8 100644 --- a/src/libImaging/Bands.c +++ b/src/libImaging/Bands.c @@ -17,6 +17,16 @@ #include "Imaging.h" +#include <emmintrin.h> +#include <mmintrin.h> +#include <smmintrin.h> + +#if defined(__AVX2__) + #include <immintrin.h> +#endif + + + Imaging ImagingGetBand(Imaging imIn, int band) { Imaging imOut; @@ -52,8 +62,10 @@ ImagingGetBand(Imaging imIn, int band) { UINT8 *out = imOut->image8[y]; x = 0; for (; x < imIn->xsize - 3; x += 4) { - UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); - memcpy(out + x, &v, sizeof(v)); + __m128i source = _mm_loadu_si128((__m128i *) in); + *((UINT32*) (out + x)) = _mm_cvtsi128_si32( + _mm_shuffle_epi8(source, _mm_set_epi8( + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, 12,8,4,0))); in += 16; } for (; x < imIn->xsize; x++) { @@ -99,10 +111,12 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { UINT8 *out1 = bands[1]->image8[y]; x = 0; for (; x < imIn->xsize - 3; x += 4) { - UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); - memcpy(out0 + x, &v, sizeof(v)); - v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]); - memcpy(out1 + x, &v, sizeof(v)); + __m128i source = _mm_loadu_si128((__m128i *) in); + source = _mm_shuffle_epi8(source, _mm_set_epi8( + 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + *((UINT32*) (out0 + x)) = _mm_cvtsi128_si32(source); + *((UINT32*) (out1 + x)) = _mm_cvtsi128_si32( + _mm_srli_si128(source, 12)); in += 16; } for (; x < imIn->xsize; x++) { @@ -119,12 +133,14 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { UINT8 *out2 = bands[2]->image8[y]; x = 0; for (; x < imIn->xsize - 3; x += 4) { - UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); - memcpy(out0 + x, &v, sizeof(v)); - v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]); - memcpy(out1 + x, &v, sizeof(v)); - v = MAKE_UINT32(in[0 + 2], in[4 + 2], in[8 + 2], in[12 + 2]); - memcpy(out2 + x, &v, sizeof(v)); + __m128i source = _mm_loadu_si128((__m128i *) in); + source = _mm_shuffle_epi8(source, _mm_set_epi8( + 
15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + *((UINT32*) (out0 + x)) = _mm_cvtsi128_si32(source); + *((UINT32*) (out1 + x)) = _mm_cvtsi128_si32( + _mm_srli_si128(source, 4)); + *((UINT32*) (out2 + x)) = _mm_cvtsi128_si32( + _mm_srli_si128(source, 8)); in += 16; } for (; x < imIn->xsize; x++) { @@ -143,14 +159,16 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { UINT8 *out3 = bands[3]->image8[y]; x = 0; for (; x < imIn->xsize - 3; x += 4) { - UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); - memcpy(out0 + x, &v, sizeof(v)); - v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]); - memcpy(out1 + x, &v, sizeof(v)); - v = MAKE_UINT32(in[0 + 2], in[4 + 2], in[8 + 2], in[12 + 2]); - memcpy(out2 + x, &v, sizeof(v)); - v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]); - memcpy(out3 + x, &v, sizeof(v)); + __m128i source = _mm_loadu_si128((__m128i *) in); + source = _mm_shuffle_epi8(source, _mm_set_epi8( + 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + *((UINT32*) (out0 + x)) = _mm_cvtsi128_si32(source); + *((UINT32*) (out1 + x)) = _mm_cvtsi128_si32( + _mm_srli_si128(source, 4)); + *((UINT32*) (out2 + x)) = _mm_cvtsi128_si32( + _mm_srli_si128(source, 8)); + *((UINT32*) (out3 + x)) = _mm_cvtsi128_si32( + _mm_srli_si128(source, 12)); in += 16; } for (; x < imIn->xsize; x++) {