From e0649632ff53c883b684fd7f4f7591174695d301 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii
Date: Sat, 6 Jul 2024 15:30:14 +0400
Subject: [PATCH 01/12] Add SIMD headers

---
 src/libImaging/ImPlatform.h  |  2 ++
 src/libImaging/ImagingSIMD.h | 41 ++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 src/libImaging/ImagingSIMD.h

diff --git a/src/libImaging/ImPlatform.h b/src/libImaging/ImPlatform.h
index f6e7fb6b9..e5fb4595e 100644
--- a/src/libImaging/ImPlatform.h
+++ b/src/libImaging/ImPlatform.h
@@ -96,3 +96,5 @@ typedef signed __int64 int64_t;
 #ifdef __GNUC__
 #define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
 #endif
+
+#include "ImagingSIMD.h"
diff --git a/src/libImaging/ImagingSIMD.h b/src/libImaging/ImagingSIMD.h
new file mode 100644
index 000000000..3d80c79d8
--- /dev/null
+++ b/src/libImaging/ImagingSIMD.h
@@ -0,0 +1,41 @@
+/* Microsoft compiler doesn't limit intrinsics for an architecture.
+   This macro is set only on x86 and means SSE2 and above including AVX2. */
+#if defined(_M_X64) || _M_IX86_FP == 2
+    #define __SSE2__
+#endif
+
+#ifdef __SSE4_2__
+    #define __SSE4__
+#endif
+
+#ifdef __SSE2__
+    #include <mmintrin.h>   // MMX
+    #include <xmmintrin.h>  // SSE
+    #include <emmintrin.h>  // SSE2
+#endif
+#ifdef __SSE4__
+    #include <pmmintrin.h>  // SSE3
+    #include <tmmintrin.h>  // SSSE3
+    #include <smmintrin.h>  // SSE4.1
+    #include <nmmintrin.h>  // SSE4.2
+#endif
+#ifdef __AVX2__
+    #include <immintrin.h>  // AVX, AVX2
+#endif
+#ifdef __aarch64__
+    #include <arm_neon.h>   // ARM NEON
+#endif
+
+#ifdef __SSE4__
+static __m128i inline
+mm_cvtepu8_epi32(void *ptr) {
+    return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(INT32 *) ptr));
+}
+#endif
+
+#ifdef __AVX2__
+static __m256i inline
+mm256_cvtepu8_epi32(void *ptr) {
+    return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *) ptr));
+}
+#endif

From 73f76e55c36d13f16802d07e86c4906be91c5fe6 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii
Date: Sat, 6 Jul 2024 15:30:22 +0400
Subject: [PATCH 02/12] Add SIMD example

---
 src/libImaging/Bands.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/src/libImaging/Bands.c b/src/libImaging/Bands.c
index e1b16b34a..1cfeb04bf 100644
--- a/src/libImaging/Bands.c
+++ b/src/libImaging/Bands.c
@@ -21,6 +21,9 @@ Imaging
 ImagingGetBand(Imaging imIn, int band) {
     Imaging imOut;
     int x, y;
+#ifdef __SSE4__
+    __m128i shuffle_mask;
+#endif
 
     /* Check arguments */
     if (!imIn || imIn->type != IMAGING_TYPE_UINT8) {
@@ -46,14 +49,25 @@ ImagingGetBand(Imaging imIn, int band) {
         return NULL;
     }
 
+#ifdef __SSE4__
+    shuffle_mask = _mm_set_epi8(
+        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, 12+band,8+band,4+band,0+band);
+#endif
+
     /* Extract band from image */
    for (y = 0; y < imIn->ysize; y++) {
         UINT8 *in = (UINT8 *)imIn->image[y] + band;
         UINT8 *out = imOut->image8[y];
         x = 0;
         for (; x < imIn->xsize - 3; x += 4) {
+#ifdef __SSE4__
+            __m128i source = _mm_loadu_si128((__m128i *)(in - band));
+            *((UINT32 *)(out + x)) = _mm_cvtsi128_si32(
+                _mm_shuffle_epi8(source, shuffle_mask));
+#else
             UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
             memcpy(out + x, &v, sizeof(v));
+#endif
             in += 16;
         }
         for (; x < imIn->xsize; x++) {
@@ -99,10 +113,18 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
             UINT8 *out1 = bands[1]->image8[y];
             x = 0;
             for (; x < imIn->xsize - 3; x += 4) {
+#ifdef __SSE4__
+                __m128i source = _mm_loadu_si128((__m128i *)in);
+                source = _mm_shuffle_epi8(source, _mm_set_epi8(
+                    15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0));
+                *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
+                *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
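+                /* The pshufb mask above packs the four bytes of band k into
+                   32-bit lane k: lane 0 (bytes 0,4,8,12) goes to out0 and
+                   lane 3 (bytes 3,7,11,15) is shifted down and goes to out1,
+                   so one 16-byte load feeds both output rows. */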
+#else UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); memcpy(out0 + x, &v, sizeof(v)); v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]); memcpy(out1 + x, &v, sizeof(v)); +#endif in += 16; } for (; x < imIn->xsize; x++) { @@ -119,12 +141,21 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { UINT8 *out2 = bands[2]->image8[y]; x = 0; for (; x < imIn->xsize - 3; x += 4) { +#ifdef __SSE4__ + __m128i source = _mm_loadu_si128((__m128i *)in); + source = _mm_shuffle_epi8(source, _mm_set_epi8( + 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); + *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4)); + *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8)); +#else UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); memcpy(out0 + x, &v, sizeof(v)); v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]); memcpy(out1 + x, &v, sizeof(v)); v = MAKE_UINT32(in[0 + 2], in[4 + 2], in[8 + 2], in[12 + 2]); memcpy(out2 + x, &v, sizeof(v)); +#endif in += 16; } for (; x < imIn->xsize; x++) { @@ -143,6 +174,15 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { UINT8 *out3 = bands[3]->image8[y]; x = 0; for (; x < imIn->xsize - 3; x += 4) { +#ifdef __SSE4__ + __m128i source = _mm_loadu_si128((__m128i *)in); + source = _mm_shuffle_epi8(source, _mm_set_epi8( + 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); + *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4)); + *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8)); + *((UINT32 *)(out3 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12)); +#else UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); memcpy(out0 + x, &v, sizeof(v)); v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]); @@ -151,6 +191,7 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { memcpy(out2 + x, &v, sizeof(v)); v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]); memcpy(out3 + x, &v, sizeof(v)); +#endif in += 16; } for (; x < imIn->xsize; x++) { From 910911061fda520ba3a9d346cc83d9fa0bfea2b4 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Sat, 6 Jul 2024 15:40:30 +0400 Subject: [PATCH 03/12] Add __SSE4_2__ for MSCC --- src/libImaging/ImagingSIMD.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/libImaging/ImagingSIMD.h b/src/libImaging/ImagingSIMD.h index 3d80c79d8..ed259cebe 100644 --- a/src/libImaging/ImagingSIMD.h +++ b/src/libImaging/ImagingSIMD.h @@ -2,8 +2,13 @@ This macro is set only on x86 and means SSE2 and above including AVX2. 
*/ #if defined(_M_X64) || _M_IX86_FP == 2 #define __SSE2__ + /* However, Microsoft compiler set __AVX2__ if /arch:AVX2 option is set */ + #ifdef __AVX2__ + #define __SSE4_2__ + #endif #endif +/* For better readability */ #ifdef __SSE4_2__ #define __SSE4__ #endif From 2db9cd34991a06ec01943a5a78c7dba9e7d67d9b Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Sun, 7 Jul 2024 17:13:44 +0400 Subject: [PATCH 04/12] Add core.acceleration attribute --- src/PIL/features.py | 4 +++- src/_imaging.c | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/PIL/features.py b/src/PIL/features.py index 13908c4eb..9f4735aee 100644 --- a/src/PIL/features.py +++ b/src/PIL/features.py @@ -128,6 +128,7 @@ features = { "libjpeg_turbo": ("PIL._imaging", "HAVE_LIBJPEGTURBO", "libjpeg_turbo_version"), "libimagequant": ("PIL._imaging", "HAVE_LIBIMAGEQUANT", "imagequant_version"), "xcb": ("PIL._imaging", "HAVE_XCB", None), + "acceleration": ("PIL._imaging", "acceleration", "acceleration"), } @@ -267,6 +268,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None: for name, feature in [ ("pil", "PIL CORE"), + ("acceleration", "Acceleration"), ("tkinter", "TKINTER"), ("freetype2", "FREETYPE2"), ("littlecms2", "LITTLECMS2"), @@ -291,7 +293,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None: if v is None: v = version(name) if v is not None: - version_static = name in ("pil", "jpg") + version_static = name in ("pil", "jpg", "acceleration") if name == "littlecms2": # this check is also in src/_imagingcms.c:setup_module() version_static = tuple(int(x) for x in v.split(".")) < (2, 7) diff --git a/src/_imaging.c b/src/_imaging.c index 07d9a64cc..84c1d55c3 100644 --- a/src/_imaging.c +++ b/src/_imaging.c @@ -4407,6 +4407,19 @@ setup_module(PyObject *m) { Py_INCREF(have_xcb); PyModule_AddObject(m, "HAVE_XCB", have_xcb); +#ifdef __AVX2__ + PyModule_AddStringConstant(m, "acceleration", "avx2"); +#elif defined(__SSE4__) + PyModule_AddStringConstant(m, "acceleration", "sse4"); +#elif defined(__SSE2__) + PyModule_AddStringConstant(m, "acceleration", "sse2"); +#elif defined(__NEON__) + PyModule_AddStringConstant(m, "acceleration", "neon"); +#else + Py_INCREF(Py_False); + PyModule_AddObject(m, "acceleration", Py_False); +#endif + PyObject *pillow_version = PyUnicode_FromString(version); PyDict_SetItemString( d, "PILLOW_VERSION", pillow_version ? 
pillow_version : Py_None From 5df34a24f88e9e68f18011ddeb6cdca2f435612b Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Sun, 7 Jul 2024 19:09:35 +0400 Subject: [PATCH 05/12] clang-format --- src/libImaging/Bands.c | 36 +++++++++++++++++++++++++--------- src/libImaging/ImagingSIMD.h | 38 ++++++++++++++++++------------------ 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/src/libImaging/Bands.c b/src/libImaging/Bands.c index 1cfeb04bf..6b3f8c03a 100644 --- a/src/libImaging/Bands.c +++ b/src/libImaging/Bands.c @@ -51,7 +51,22 @@ ImagingGetBand(Imaging imIn, int band) { #ifdef __SSE4__ shuffle_mask = _mm_set_epi8( - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, 12+band,8+band,4+band,0+band); + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + 12 + band, + 8 + band, + 4 + band, + 0 + band); #endif /* Extract band from image */ @@ -62,8 +77,8 @@ ImagingGetBand(Imaging imIn, int band) { for (; x < imIn->xsize - 3; x += 4) { #ifdef __SSE4__ __m128i source = _mm_loadu_si128((__m128i *)(in - band)); - *((UINT32 *)(out + x)) = _mm_cvtsi128_si32( - _mm_shuffle_epi8(source, shuffle_mask)); + *((UINT32 *)(out + x)) = + _mm_cvtsi128_si32(_mm_shuffle_epi8(source, shuffle_mask)); #else UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]); memcpy(out + x, &v, sizeof(v)); @@ -115,8 +130,9 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { for (; x < imIn->xsize - 3; x += 4) { #ifdef __SSE4__ __m128i source = _mm_loadu_si128((__m128i *)in); - source = _mm_shuffle_epi8(source, _mm_set_epi8( - 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + source = _mm_shuffle_epi8( + source, + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)); *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12)); #else @@ -143,8 +159,9 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { for (; x < imIn->xsize - 3; x += 4) { #ifdef __SSE4__ __m128i source = _mm_loadu_si128((__m128i *)in); - source = _mm_shuffle_epi8(source, _mm_set_epi8( - 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + source = _mm_shuffle_epi8( + source, + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)); *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4)); *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8)); @@ -176,8 +193,9 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { for (; x < imIn->xsize - 3; x += 4) { #ifdef __SSE4__ __m128i source = _mm_loadu_si128((__m128i *)in); - source = _mm_shuffle_epi8(source, _mm_set_epi8( - 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0)); + source = _mm_shuffle_epi8( + source, + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)); *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4)); *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8)); diff --git a/src/libImaging/ImagingSIMD.h b/src/libImaging/ImagingSIMD.h index ed259cebe..a63f3b0aa 100644 --- a/src/libImaging/ImagingSIMD.h +++ b/src/libImaging/ImagingSIMD.h @@ -1,46 +1,46 @@ /* Microsoft compiler doesn't limit intrinsics for an architecture. This macro is set only on x86 and means SSE2 and above including AVX2. 
*/
 #if defined(_M_X64) || _M_IX86_FP == 2
-    #define __SSE2__
-    /* However, Microsoft compiler set __AVX2__ if /arch:AVX2 option is set */
-    #ifdef __AVX2__
-        #define __SSE4_2__
-    #endif
+#define __SSE2__
+/* However, Microsoft compiler set __AVX2__ if /arch:AVX2 option is set */
+#ifdef __AVX2__
+#define __SSE4_2__
+#endif
 #endif
 
 /* For better readability */
 #ifdef __SSE4_2__
-    #define __SSE4__
+#define __SSE4__
 #endif
 
 #ifdef __SSE2__
-    #include <mmintrin.h>   // MMX
-    #include <xmmintrin.h>  // SSE
-    #include <emmintrin.h>  // SSE2
+#include <mmintrin.h>  // MMX
+#include <xmmintrin.h>  // SSE
+#include <emmintrin.h>  // SSE2
 #endif
 #ifdef __SSE4__
-    #include <pmmintrin.h>  // SSE3
-    #include <tmmintrin.h>  // SSSE3
-    #include <smmintrin.h>  // SSE4.1
-    #include <nmmintrin.h>  // SSE4.2
+#include <pmmintrin.h>  // SSE3
+#include <tmmintrin.h>  // SSSE3
+#include <smmintrin.h>  // SSE4.1
+#include <nmmintrin.h>  // SSE4.2
 #endif
 #ifdef __AVX2__
-    #include <immintrin.h>  // AVX, AVX2
+#include <immintrin.h>  // AVX, AVX2
 #endif
 #ifdef __aarch64__
-    #include <arm_neon.h>   // ARM NEON
+#include <arm_neon.h>  // ARM NEON
 #endif
 
 #ifdef __SSE4__
-static __m128i inline
+static inline __m128i
 mm_cvtepu8_epi32(void *ptr) {
-    return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(INT32 *) ptr));
+    return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(INT32 *)ptr));
 }
 #endif
 
 #ifdef __AVX2__
-static __m256i inline
+static inline __m256i
 mm256_cvtepu8_epi32(void *ptr) {
-    return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *) ptr));
+    return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)ptr));
 }
 #endif

From f891043101f8f890776b079cf42b1b1411047575 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii
Date: Mon, 8 Jul 2024 11:13:21 +0400
Subject: [PATCH 06/12] Define better name __NEON__

---
 src/libImaging/ImagingSIMD.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/libImaging/ImagingSIMD.h b/src/libImaging/ImagingSIMD.h
index a63f3b0aa..fbf612868 100644
--- a/src/libImaging/ImagingSIMD.h
+++ b/src/libImaging/ImagingSIMD.h
@@ -12,6 +12,9 @@
 #ifdef __SSE4_2__
 #define __SSE4__
 #endif
+#ifdef __aarch64__
+#define __NEON__
+#endif
 
 #ifdef __SSE2__
 #include <mmintrin.h>  // MMX
@@ -27,7 +30,7 @@
 #ifdef __AVX2__
 #include <immintrin.h>  // AVX, AVX2
 #endif
-#ifdef __aarch64__
+#ifdef __NEON__
 #include <arm_neon.h>  // ARM NEON
 #endif

From 4d10b3c0ab9e5b1f8173ecf89896fdee98417dd3 Mon Sep 17 00:00:00 2001
From: Andrew Murray
Date: Mon, 8 Jul 2024 18:32:03 +1000
Subject: [PATCH 07/12] Updated test

---
 Tests/test_features.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Tests/test_features.py b/Tests/test_features.py
index b7eefa09a..dbe53ee29 100644
--- a/Tests/test_features.py
+++ b/Tests/test_features.py
@@ -36,7 +36,9 @@ def test_version() -> None:
             assert version is None
         else:
             assert function(name) == version
-    if name != "PIL":
+    if name == "acceleration":
+        assert version in ("avx2", "sse4", "sse2", "neon", None)
+    elif name != "PIL":
         if name == "zlib" and version is not None:
             version = re.sub(".zlib-ng$", "", version)
         elif name == "libtiff" and version is not None:

From a410dcbe9a9c0db5d2e254302876e8e9b0af3a40 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii
Date: Tue, 16 Jul 2024 22:45:27 +0400
Subject: [PATCH 08/12] Add accelerated test builds

---
 .github/workflows/test.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6e63333b0..9e6fcab32 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -52,15 +52,19 @@ jobs:
         include:
           - { python-version: "3.11", PYTHONOPTIMIZE: 1, REVERSE: "--reverse" }
           - { python-version: "3.10", PYTHONOPTIMIZE: 2 }
+          # SIMD-accelerated builds for x86
+          - { os: "ubuntu-latest", python-version: "3.9", acceleration: "sse4"}
+          - { os:
"ubuntu-latest", python-version: "3.12", acceleration: "avx2"} # Free-threaded - { os: "ubuntu-latest", python-version: "3.13-dev", disable-gil: true } # M1 only available for 3.10+ - { os: "macos-13", python-version: "3.9" } + - { os: "macos-13", python-version: "3.9", acceleration: "avx2"} exclude: - { os: "macos-14", python-version: "3.9" } runs-on: ${{ matrix.os }} - name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.disable-gil && 'free-threaded' || '' }} + name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.acceleration }} ${{ matrix.disable-gil && 'free-threaded' || '' }} steps: - uses: actions/checkout@v4 @@ -108,7 +112,7 @@ jobs: GHA_LIBIMAGEQUANT_CACHE_HIT: ${{ steps.cache-libimagequant.outputs.cache-hit }} - name: Install macOS dependencies - if: startsWith(matrix.os, 'macOS') + if: startsWith(matrix.os, 'macos') run: | .github/workflows/macos-install.sh env: @@ -118,6 +122,11 @@ jobs: if: "matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'" run: echo "::add-matcher::.github/problem-matchers/gcc.json" + - name: Set compiler options for optimization + if: ${{ matrix.acceleration }} + run: | + echo "CC=cc -m${{ matrix.acceleration }}" >> $GITHUB_ENV + - name: Build run: | .ci/build.sh From cb55ee64b50327721b630fd6be98e1d170b67083 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Sun, 11 Aug 2024 21:11:46 +0400 Subject: [PATCH 09/12] update clang formatting --- src/libImaging/Bands.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/libImaging/Bands.c b/src/libImaging/Bands.c index 6b3f8c03a..0ab3156ed 100644 --- a/src/libImaging/Bands.c +++ b/src/libImaging/Bands.c @@ -66,7 +66,8 @@ ImagingGetBand(Imaging imIn, int band) { 12 + band, 8 + band, 4 + band, - 0 + band); + 0 + band + ); #endif /* Extract band from image */ @@ -132,7 +133,8 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { __m128i source = _mm_loadu_si128((__m128i *)in); source = _mm_shuffle_epi8( source, - _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)); + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0) + ); *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12)); #else @@ -161,7 +163,8 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { __m128i source = _mm_loadu_si128((__m128i *)in); source = _mm_shuffle_epi8( source, - _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)); + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0) + ); *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4)); *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8)); @@ -195,7 +198,8 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) { __m128i source = _mm_loadu_si128((__m128i *)in); source = _mm_shuffle_epi8( source, - _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)); + _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0) + ); *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source); *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4)); *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8)); From 8e3a81a53c62963d186edd685889160c7ca67bc1 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 13 Aug 2017 17:16:16 +0300 Subject: [PATCH 10/12] SIMD Filter. 3x3 implementation SIMD Filter. 5x5 implementation SIMD Filter. fast 3x3 filter SIMD Filter. a bit faster 5x5 filter SIMD Filter. 
improve locality in 5x5 filter
SIMD Filter. rearrange 3x3 filter to match 5x5
SIMD Filter. use macros
SIMD Filter. use macros in 3x3
SIMD Filter. 3x3 SSE4 singleband
SIMD Filter. faster 3x3 singleband SSE4
SIMD Filter. reuse loaded values
SIMD Filter. 3x3 SSE4 singleband: 2 lines
SIMD Filter. First AVX try
SIMD Filter. unroll AVX 2 times
SIMD Filter. Macros for AVX
SIMD Filter. unroll AVX (with no profit)
SIMD Filter. consider last pixel in AVX
SIMD Filter. 5x5 single channel SSE4 (tests failed)
SIMD Filter. fix offset
SIMD Filter. move ImagingFilterxxx functions to separate files
SIMD Filter. 3x3i
SIMD Filter. better macros
SIMD Filter. better loading
SIMD Filter. Rearrange instruction for speedup
SIMD Filter. reduce number of registers
SIMD Filter. rearrange operations
SIMD Filter. avx2 version
SIMD Filter. finish 3x3i_4u8
SIMD Filter. 5x5i_4u8 SSE4
SIMD Filter. advanced 5x5i_4u8 SSE4
SIMD Filter. 5x5i_4u8 AVX2
SIMD Filter. fix memory access for: 3x3f_u8 3x3i_4u8 5x5i_4u8
SIMD Filter. move files
SIMD Filter. Correct offset for 3x3f_u8

# Conflicts:
#	src/libImaging/Filter.c
---
 src/libImaging/Filter.c              | 343 +++++----------------
 src/libImaging/FilterSIMD_3x3f_4u8.c | 237 ++++++++++++++++++
 src/libImaging/FilterSIMD_3x3f_u8.c  | 159 +++++++++++++
 src/libImaging/FilterSIMD_3x3i_4u8.c | 222 +++++++++++++++++
 src/libImaging/FilterSIMD_5x5f_4u8.c | 126 ++++++++++
 src/libImaging/FilterSIMD_5x5f_u8.c  | 100 ++++++++
 src/libImaging/FilterSIMD_5x5i_4u8.c | 250 +++++++++++++++++++
 7 files changed, 1151 insertions(+), 286 deletions(-)
 create mode 100644 src/libImaging/FilterSIMD_3x3f_4u8.c
 create mode 100644 src/libImaging/FilterSIMD_3x3f_u8.c
 create mode 100644 src/libImaging/FilterSIMD_3x3i_4u8.c
 create mode 100644 src/libImaging/FilterSIMD_5x5f_4u8.c
 create mode 100644 src/libImaging/FilterSIMD_5x5f_u8.c
 create mode 100644 src/libImaging/FilterSIMD_5x5i_4u8.c

diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c
index fbd6b425f..df3ca3630 100644
--- a/src/libImaging/Filter.c
+++ b/src/libImaging/Filter.c
@@ -26,16 +26,21 @@
 
 #include "Imaging.h"
 
-static inline UINT8
-clip8(float in) {
-    if (in <= 0.0) {
-        return 0;
-    }
-    if (in >= 255.0) {
-        return 255;
-    }
-    return (UINT8)in;
-}
+/* 5 is the number of bits, enough to account for all kernel coefficients (1<<5 > 25).
+   8 is the number of bits in the result.
+   Any coefficient delta smaller than this precision will have no effect. */
+#define PRECISION_BITS (8 + 5)
+/* 16 is the number of bits required for kernel storage.
+   1 bit is reserved for sign.
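+   With PRECISION_BITS = 13, 16 - 1 - 13 = 2 bits remain, i.e. 1 << 2 = 4.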
+ Largest possible kernel coefficient is */ +#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS)) + +#include "FilterSIMD_3x3f_u8.c" +#include "FilterSIMD_3x3f_4u8.c" +#include "FilterSIMD_3x3i_4u8.c" +#include "FilterSIMD_5x5f_u8.c" +#include "FilterSIMD_5x5f_4u8.c" +#include "FilterSIMD_5x5i_4u8.c" static inline INT32 clip32(float in) { @@ -105,283 +110,14 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin) { return imOut; } -void -ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x3(in0, x, kernel, d) \ - (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \ - _i2f(in0[x + d]) * (kernel)[2]) - - int x = 0, y = 0; - - memcpy(imOut->image[0], im->image[0], im->linesize); - if (im->bands == 1) { - // Add one time for rounding - offset += 0.5; - if (im->type == IMAGING_TYPE_INT32) { - for (y = 1; y < im->ysize - 1; y++) { - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *out = (INT32 *)imOut->image[y]; - - out[0] = in0[0]; - for (x = 1; x < im->xsize - 1; x++) { - float ss = offset; - ss += KERNEL1x3(in1, x, &kernel[0], 1); - ss += KERNEL1x3(in0, x, &kernel[3], 1); - ss += KERNEL1x3(in_1, x, &kernel[6], 1); - out[x] = clip32(ss); - } - out[x] = in0[x]; - } - } else { - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - out[0] = in0[0]; - for (x = 1; x < im->xsize - 1; x++) { - float ss = offset; - ss += KERNEL1x3(in1, x, &kernel[0], 1); - ss += KERNEL1x3(in0, x, &kernel[3], 1); - ss += KERNEL1x3(in_1, x, &kernel[6], 1); - out[x] = clip8(ss); - } - out[x] = in0[x]; - } - } - } else { - // Add one time for rounding - offset += 0.5; - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - memcpy(out, in0, sizeof(UINT32)); - if (im->bands == 2) { - for (x = 1; x < im->xsize - 1; x++) { - float ss0 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); - ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); - ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4); - ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); - ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4); - v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 3) { - for (x = 1; x < im->xsize - 1; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - UINT32 v; - ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); - ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4); - ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); - ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4); - ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4); - ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); - ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4); - ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 4) { - for (x = 1; x < im->xsize - 1; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x3(in1, x * 4 + 0, 
&kernel[0], 4); - ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4); - ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); - ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4); - ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4); - ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4); - ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); - ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4); - ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4); - ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } - memcpy(out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32)); - } - } - memcpy(imOut->image[y], im->image[y], im->linesize); -} - -void -ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x5(in0, x, kernel, d) \ - (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \ - _i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] + \ - _i2f(in0[x + d + d]) * (kernel)[4]) - - int x = 0, y = 0; - - memcpy(imOut->image[0], im->image[0], im->linesize); - memcpy(imOut->image[1], im->image[1], im->linesize); - if (im->bands == 1) { - // Add one time for rounding - offset += 0.5; - if (im->type == IMAGING_TYPE_INT32) { - for (y = 2; y < im->ysize - 2; y++) { - INT32 *in_2 = (INT32 *)im->image[y - 2]; - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *in2 = (INT32 *)im->image[y + 2]; - INT32 *out = (INT32 *)imOut->image[y]; - - out[0] = in0[0]; - out[1] = in0[1]; - for (x = 2; x < im->xsize - 2; x++) { - float ss = offset; - ss += KERNEL1x5(in2, x, &kernel[0], 1); - ss += KERNEL1x5(in1, x, &kernel[5], 1); - ss += KERNEL1x5(in0, x, &kernel[10], 1); - ss += KERNEL1x5(in_1, x, &kernel[15], 1); - ss += KERNEL1x5(in_2, x, &kernel[20], 1); - out[x] = clip32(ss); - } - out[x + 0] = in0[x + 0]; - out[x + 1] = in0[x + 1]; - } - } else { - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - out[0] = in0[0]; - out[1] = in0[1]; - for (x = 2; x < im->xsize - 2; x++) { - float ss = offset; - ss += KERNEL1x5(in2, x, &kernel[0], 1); - ss += KERNEL1x5(in1, x, &kernel[5], 1); - ss += KERNEL1x5(in0, x, &kernel[10], 1); - ss += KERNEL1x5(in_1, x, &kernel[15], 1); - ss += KERNEL1x5(in_2, x, &kernel[20], 1); - out[x] = clip8(ss); - } - out[x + 0] = in0[x + 0]; - out[x + 1] = in0[x + 1]; - } - } - } else { - // Add one time for rounding - offset += 0.5; - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; - - memcpy(out, in0, sizeof(UINT32) * 2); - if (im->bands == 2) { - for (x = 2; x < im->xsize - 2; x++) { - float ss0 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); - ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); - ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x * 4 + 0, 
&kernel[10], 4); - ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); - ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); - ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4); - v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 3) { - for (x = 2; x < im->xsize - 2; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - UINT32 v; - ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); - ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); - ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4); - ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); - ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4); - ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); - ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4); - ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); - ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4); - ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } else if (im->bands == 4) { - for (x = 2; x < im->xsize - 2; x++) { - float ss0 = offset; - float ss1 = offset; - float ss2 = offset; - float ss3 = offset; - UINT32 v; - ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); - ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4); - ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4); - ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4); - ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); - ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4); - ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4); - ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4); - ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); - ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4); - ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4); - ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4); - ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); - ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4); - ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4); - ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4); - ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); - ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4); - ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4); - ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4); - v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); - memcpy(out + x * sizeof(v), &v, sizeof(v)); - } - } - memcpy( - out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32) * 2 - ); - } - } - memcpy(imOut->image[y], im->image[y], im->linesize); - memcpy(imOut->image[y + 1], im->image[y + 1], im->linesize); -} - Imaging ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) { - Imaging imOut; ImagingSectionCookie cookie; + Imaging imOut; + int i, fast_path = 1; + FLOAT32 tmp; + INT16 norm_kernel[25]; + INT32 norm_offset; if (im->type != IMAGING_TYPE_UINT8 && im->type != IMAGING_TYPE_INT32) { return (Imaging)ImagingError_ModeError(); @@ -400,13 +136,48 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 o return NULL; } + for (i = 0; i < xsize * ysize; i++) { + tmp = kernel[i] * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 
0.5 : -0.5; + if (tmp >= 32768 || tmp <= -32769) { + fast_path = 0; + break; + } + norm_kernel[i] = tmp; + } + tmp = offset * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 0.5 : -0.5; + if (tmp >= 1 << 30) { + norm_offset = 1 << 30; + } else if (tmp <= -1 << 30) { + norm_offset = -1 << 30; + } else { + norm_offset = tmp; + } + ImagingSectionEnter(&cookie); if (xsize == 3) { /* 3x3 kernel. */ - ImagingFilter3x3(imOut, im, kernel, offset); + if (im->image8) { + ImagingFilter3x3f_u8(imOut, im, kernel, offset); + } else { + if (fast_path) { + ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter3x3f_4u8(imOut, im, kernel, offset); + } + } } else { /* 5x5 kernel. */ - ImagingFilter5x5(imOut, im, kernel, offset); + if (im->image8) { + ImagingFilter5x5f_u8(imOut, im, kernel, offset); + } else { + if (fast_path) { + ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter5x5f_4u8(imOut, im, kernel, offset); + } + } } ImagingSectionLeave(&cookie); return imOut; diff --git a/src/libImaging/FilterSIMD_3x3f_4u8.c b/src/libImaging/FilterSIMD_3x3f_4u8.c new file mode 100644 index 000000000..02272f688 --- /dev/null +++ b/src/libImaging/FilterSIMD_3x3f_4u8.c @@ -0,0 +1,237 @@ +void +ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x3_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); + +#define MM_KERNEL1x3_SUM(row, kindex) \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[3 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[6 + kindex]))); + + int x = 0, y = 0; + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + + for (y = 1; y < im->ysize-1; y++) { + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* out = imOut->image32[y]; + +#if defined(__AVX2__) + +#define MM256_LOAD(row, x) \ + pix0##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in_1[x])); + +#define MM256_SUM(pixrow, kernelrow) \ + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \ + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix1##pixrow, kernel1##kernelrow)); \ + ss = _mm256_add_ps(ss, _mm256_mul_ps(pix2##pixrow, kernel2##kernelrow)); + +#define MM256_PERMUTE(row, from0, from1, control) \ + pix0##row = _mm256_permute2f128_ps(pix0##from0, pix0##from1, control); \ + pix1##row = _mm256_permute2f128_ps(pix1##from0, pix1##from1, control); \ + pix2##row = _mm256_permute2f128_ps(pix2##from0, pix2##from1, control); + +#define MM256_OUT(x) \ + _mm_cvtps_epi32(_mm_add_ps( \ + _mm256_extractf128_ps(ss, 0), \ + _mm256_extractf128_ps(ss, 1) \ + )) + + __m256 kernel00 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[0+0]), + _mm_set1_ps(kernel[0+1]), 1); + __m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2])); + __m256 kernel10 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[3+0]), + _mm_set1_ps(kernel[3+1]), 1); + __m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2])); + __m256 kernel20 = _mm256_insertf128_ps( + _mm256_set1_ps(kernel[6+0]), + _mm_set1_ps(kernel[6+1]), 1); + __m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2])); + 
__m256 pix00, pix10, pix20; + __m256 pix01, pix11, pix21; + __m256 pix02, pix12, pix22; + + out[0] = in0[0]; + x = 1; + MM256_LOAD(0, 0); + for (; x < im->xsize-1-7; x += 8) { + __m256 ss; + __m128i ssi0, ssi1, ssi2, ssi3; + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(0, 0); + MM256_LOAD(1, x+1); + MM256_SUM(1, 1); + ssi0 = MM256_OUT(x); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_LOAD(2, x+3); + MM256_SUM(2, 1); + ssi2 = MM256_OUT(x+2); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_PERMUTE(0, 0, 1, 0x21); + MM256_SUM(0, 0); + MM256_PERMUTE(1, 1, 2, 0x21); + MM256_SUM(1, 1); + ssi1 = MM256_OUT(x+1); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_PERMUTE(0, 2, 2, 0x21); + MM256_SUM(0, 1); + ssi3 = MM256_OUT(x+3); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi2 = _mm_packs_epi32(ssi2, ssi3); + ssi0 = _mm_packus_epi16(ssi0, ssi2); + _mm_storeu_si128((__m128i*) &out[x], ssi0); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(2, 0); + MM256_LOAD(1, x+5); + MM256_SUM(1, 1); + ssi0 = MM256_OUT(x+4); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_LOAD(0, x+7); + MM256_SUM(0, 1); + ssi2 = MM256_OUT(x+6); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_PERMUTE(2, 2, 1, 0x21); + MM256_SUM(2, 0); + MM256_PERMUTE(1, 1, 0, 0x21); + MM256_SUM(1, 1); + ssi1 = MM256_OUT(x+5); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(1, 0); + MM256_PERMUTE(2, 0, 0, 0x21); + MM256_SUM(2, 1); + ssi3 = MM256_OUT(x+7); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi2 = _mm_packs_epi32(ssi2, ssi3); + ssi0 = _mm_packus_epi16(ssi0, ssi2); + _mm_storeu_si128((__m128i*) &out[x+4], ssi0); + } + + for (; x < im->xsize-1-1; x += 2) { + __m256 ss; + __m128i ssi0, ssi1; + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_SUM(0, 0); + MM256_LOAD(1, x+1); + MM256_SUM(1, 1); + ssi0 = MM256_OUT(x); + + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); + MM256_PERMUTE(0, 0, 1, 0x21); + MM256_SUM(0, 0); + MM256_PERMUTE(1, 0, 1, 0xf3); + MM256_SUM(1, 1); + ssi1 = MM256_OUT(x+1); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + _mm_storel_epi64((__m128i*) &out[x], ssi0); + + MM256_PERMUTE(0, 0, 1, 0x21); + } + for (; x < im->xsize-1; x++) { + __m128 pix00, pix10, pix20; + __m128 ss = _mm_set1_ps(offset); + __m128i ssi0; + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM(0, 0); + MM_KERNEL1x3_LOAD(0, x+0); + MM_KERNEL1x3_SUM(0, 1); + MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_SUM(0, 2); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; + +#undef MM256_LOAD +#undef MM256_SUM +#undef MM256_PERMUTE +#undef MM256_OUT + +#else + __m128 pix00, pix10, pix20; + __m128 pix01, pix11, pix21; + __m128 pix02, pix12, pix22; + MM_KERNEL1x3_LOAD(0, 0); + MM_KERNEL1x3_LOAD(1, 1); + + out[0] = in0[0]; + x = 1; + for (; x < im->xsize-1-2; x += 3) { + __m128 ss; + __m128i ssi0, ssi1, ssi2; + + ss = _mm_set1_ps(offset); + MM_KERNEL1x3_SUM(0, 0); + MM_KERNEL1x3_SUM(1, 1); + MM_KERNEL1x3_LOAD(2, x+1); + MM_KERNEL1x3_SUM(2, 2); + ssi0 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x3_SUM(1, 0); + MM_KERNEL1x3_SUM(2, 1); + MM_KERNEL1x3_LOAD(0, x+2); + MM_KERNEL1x3_SUM(0, 2); + ssi1 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x3_SUM(2, 0); + MM_KERNEL1x3_SUM(0, 1); + MM_KERNEL1x3_LOAD(1, 
x+3); + MM_KERNEL1x3_SUM(1, 2); + ssi2 = _mm_cvtps_epi32(ss); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi1 = _mm_packs_epi32(ssi2, ssi2); + ssi0 = _mm_packus_epi16(ssi0, ssi1); + _mm_storeu_si128((__m128i*) &out[x], ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss = _mm_set1_ps(offset); + __m128i ssi0; + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM(0, 0); + MM_KERNEL1x3_LOAD(0, x+0); + MM_KERNEL1x3_SUM(0, 1); + MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_SUM(0, 2); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; +#endif + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); + +#undef MM_KERNEL1x3_LOAD +#undef MM_KERNEL1x3_SUM +} diff --git a/src/libImaging/FilterSIMD_3x3f_u8.c b/src/libImaging/FilterSIMD_3x3f_u8.c new file mode 100644 index 000000000..858366c1d --- /dev/null +++ b/src/libImaging/FilterSIMD_3x3f_u8.c @@ -0,0 +1,159 @@ +void +ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x3_SUM1(ss, row, kernel) \ + ss = _mm_mul_ps(pix0##row, kernel0##kernel); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##kernel)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##kernel)); + +#define MM_KERNEL1x3_SUM1_2(ss, row, kernel) \ + ss = _mm_mul_ps(pix3##row, kernel0##kernel); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel1##kernel)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel)); + +#define MM_KERNEL1x3_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); + + int x, y; + __m128 kernel00 = _mm_set_ps(0, kernel[2], kernel[1], kernel[0]); + __m128 kernel10 = _mm_set_ps(0, kernel[5], kernel[4], kernel[3]); + __m128 kernel20 = _mm_set_ps(0, kernel[8], kernel[7], kernel[6]); + __m128 kernel01 = _mm_set_ps(kernel[2], kernel[1], kernel[0], 0); + __m128 kernel11 = _mm_set_ps(kernel[5], kernel[4], kernel[3], 0); + __m128 kernel21 = _mm_set_ps(kernel[8], kernel[7], kernel[6], 0); + __m128 mm_offset = _mm_set1_ps(offset); + + memcpy(imOut->image8[0], im->image8[0], im->linesize); + y = 1; + for (; y < im->ysize-1-1; y += 2) { + UINT8* in_1 = im->image8[y-1]; + UINT8* in0 = im->image8[y]; + UINT8* in1 = im->image8[y+1]; + UINT8* in2 = im->image8[y+2]; + UINT8* out0 = imOut->image8[y]; + UINT8* out1 = imOut->image8[y+1]; + + out0[0] = in0[0]; + out1[0] = in1[0]; + x = 1; + for (; x < im->xsize-1-3; x += 4) { + __m128 ss0, ss1, ss2, ss3, ss4, ss5; + __m128 pix00, pix10, pix20, pix30; + __m128i ssi0; + + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM1(ss0, 0, 0); + MM_KERNEL1x3_SUM1(ss1, 0, 1); + ss0 = _mm_hadd_ps(ss0, ss1); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x-1])); + MM_KERNEL1x3_SUM1_2(ss3, 0, 0); + MM_KERNEL1x3_SUM1_2(ss4, 0, 1); + ss3 = _mm_hadd_ps(ss3, ss4); + + MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_SUM1(ss1, 0, 0); + MM_KERNEL1x3_SUM1(ss2, 0, 1); + ss1 = _mm_hadd_ps(ss1, ss2); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x+1])); + MM_KERNEL1x3_SUM1_2(ss4, 0, 0); + MM_KERNEL1x3_SUM1_2(ss5, 0, 1); + ss4 = _mm_hadd_ps(ss4, ss5); + + ss0 = _mm_hadd_ps(ss0, ss1); + ss0 = _mm_add_ps(ss0, mm_offset); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out0[x]) = _mm_cvtsi128_si32(ssi0); + + ss3 = _mm_hadd_ps(ss3, ss4); + ss3 = _mm_add_ps(ss3, mm_offset); + 
ssi0 = _mm_cvtps_epi32(ss3); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out1[x]) = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss0, ss1; + __m128 pix00, pix10, pix20, pix30; + __m128i ssi0; + + pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); + pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); + pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + pix30 = _mm_set_ps(0, in2[x+1], in2[x], in2[x-1]); + MM_KERNEL1x3_SUM1(ss0, 0, 0); + MM_KERNEL1x3_SUM1_2(ss1, 0, 0); + + ss0 = _mm_hadd_ps(ss0, ss0); + ss0 = _mm_hadd_ps(ss0, ss0); + ss0 = _mm_add_ps(ss0, mm_offset); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out0[x] = _mm_cvtsi128_si32(ssi0); + + ss1 = _mm_hadd_ps(ss1, ss1); + ss1 = _mm_hadd_ps(ss1, ss1); + ss1 = _mm_add_ps(ss1, mm_offset); + ssi0 = _mm_cvtps_epi32(ss1); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out1[x] = _mm_cvtsi128_si32(ssi0); + } + out0[x] = in0[x]; + out1[x] = in1[x]; + } + for (; y < im->ysize-1; y++) { + UINT8* in_1 = im->image8[y-1]; + UINT8* in0 = im->image8[y]; + UINT8* in1 = im->image8[y+1]; + UINT8* out = imOut->image8[y]; + + out[0] = in0[0]; + x = 1; + for (; x < im->xsize-2; x++) { + __m128 ss; + __m128 pix00, pix10, pix20; + __m128i ssi0; + + MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_SUM1(ss, 0, 0); + + ss = _mm_hadd_ps(ss, ss); + ss = _mm_hadd_ps(ss, ss); + ss = _mm_add_ps(ss, mm_offset); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-1; x++) { + __m128 ss; + __m128 pix00, pix10, pix20; + __m128i ssi0; + + pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); + pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); + pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + MM_KERNEL1x3_SUM1(ss, 0, 0); + + ss = _mm_hadd_ps(ss, ss); + ss = _mm_hadd_ps(ss, ss); + ss = _mm_add_ps(ss, mm_offset); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; + } + memcpy(imOut->image8[y], im->image8[y], im->linesize); + +#undef MM_KERNEL1x3_SUM1 +#undef MM_KERNEL1x3_SUM1_2 +#undef MM_KERNEL1x3_LOAD +} diff --git a/src/libImaging/FilterSIMD_3x3i_4u8.c b/src/libImaging/FilterSIMD_3x3i_4u8.c new file mode 100644 index 000000000..2a52d45eb --- /dev/null +++ b/src/libImaging/FilterSIMD_3x3i_4u8.c @@ -0,0 +1,222 @@ +void +ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, + INT32 offset) +{ + int x, y; + + offset += 1 << (PRECISION_BITS-1); + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + + for (y = 1; y < im->ysize-1; y++) { + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* out = imOut->image32[y]; + +#if defined(__AVX2__) + + #define MM_KERNEL_LOAD(x) \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix10 = _mm256_unpacklo_epi8(source, 
_mm256_setzero_si256()); \ + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + + #define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix0##row, _mm256_shuffle_epi32(kernel00, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix1##row, _mm256_shuffle_epi32(kernel10, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + pix2##row, _mm256_shuffle_epi32(kernel20, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m256i kernel00 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[2], kernel[1], kernel[0], 0, + 0, 0, 0, 0, 0, kernel[2], kernel[1], kernel[0]); + __m256i kernel10 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[5], kernel[4], kernel[3], 0, + 0, 0, 0, 0, 0, kernel[5], kernel[4], kernel[3]); + __m256i kernel20 = _mm256_set_epi16( + 0, 0, 0, 0, kernel[8], kernel[7], kernel[6], 0, + 0, 0, 0, 0, 0, kernel[8], kernel[7], kernel[6]); + __m256i pix00, pix10, pix20; + __m256i pix01, pix11, pix21; + __m256i source; + + out[0] = in0[0]; + x = 1; + if (im->xsize >= 4) { + MM_KERNEL_LOAD(0); + } + // Last loaded pixel: x+3 + 3 (wide load) + // Last x should be < im->xsize-6 + for (; x < im->xsize-1-5; x += 4) { + __m256i ss0, ss2; + __m128i ssout; + + ss0 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss0, 0, 0x00); + MM_KERNEL_SUM(ss0, 1, 0x55); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + + ss2 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss2, 1, 0x00); + MM_KERNEL_LOAD(x+3); + MM_KERNEL_SUM(ss2, 0, 0x55); + ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS); + + ss0 = _mm256_packs_epi32(ss0, ss2); + ssout = _mm_packus_epi16( + _mm256_extracti128_si256(ss0, 0), + _mm256_extracti128_si256(ss0, 1)); + ssout = _mm_shuffle_epi32(ssout, 0xd8); + _mm_storeu_si128((__m128i*) &out[x], ssout); + } + for (; x < im->xsize-1; x++) { + __m256i ss = _mm256_set1_epi32(offset); + + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in1[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in1[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in0[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in0[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in_1[x+1]), 8), + _mm_loadl_epi64((__m128i*) &in_1[x-1])), shuffle)); + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); + + MM_KERNEL_SUM(ss, 0, 0x00); + MM_KERNEL_SUM(ss, 1, 0x55); + + ss = _mm256_srai_epi32(ss, PRECISION_BITS); + + ss = _mm256_packs_epi32(ss, ss); + ss = _mm256_packus_epi16(ss, ss); + out[x] = 
_mm_cvtsi128_si32(_mm256_castsi256_si128(ss)); + } + out[x] = in0[x]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#else + + #define MM_KERNEL_LOAD(x) \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); + + #define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix0##row, _mm_shuffle_epi32(kernel00, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix1##row, _mm_shuffle_epi32(kernel10, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + pix2##row, _mm_shuffle_epi32(kernel20, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i kernel00 = _mm_set_epi16( + kernel[2], kernel[1], kernel[0], 0, + 0, kernel[2], kernel[1], kernel[0]); + __m128i kernel10 = _mm_set_epi16( + kernel[5], kernel[4], kernel[3], 0, + 0, kernel[5], kernel[4], kernel[3]); + __m128i kernel20 = _mm_set_epi16( + kernel[8], kernel[7], kernel[6], 0, + 0, kernel[8], kernel[7], kernel[6]); + __m128i pix00, pix10, pix20; + __m128i pix01, pix11, pix21; + __m128i source; + + out[0] = in0[0]; + x = 1; + if (im->xsize >= 4) { + MM_KERNEL_LOAD(0); + } + // Last loaded pixel: x+3 + 3 (wide load) + // Last x should be < im->xsize-6 + for (; x < im->xsize-1-5; x += 4) { + __m128i ss0 = _mm_set1_epi32(offset); + __m128i ss1 = _mm_set1_epi32(offset); + __m128i ss2 = _mm_set1_epi32(offset); + __m128i ss3 = _mm_set1_epi32(offset); + + MM_KERNEL_SUM(ss0, 0, 0x00); + MM_KERNEL_SUM(ss0, 1, 0x55); + ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); + + MM_KERNEL_SUM(ss1, 0, 0xaa); + MM_KERNEL_SUM(ss1, 1, 0xff); + ss1 = _mm_srai_epi32(ss1, PRECISION_BITS); + + MM_KERNEL_SUM(ss2, 1, 0x00); + MM_KERNEL_SUM(ss3, 1, 0xaa); + + MM_KERNEL_LOAD(x+3); + + MM_KERNEL_SUM(ss2, 0, 0x55); + MM_KERNEL_SUM(ss3, 0, 0xff); + ss2 = _mm_srai_epi32(ss2, PRECISION_BITS); + ss3 = _mm_srai_epi32(ss3, PRECISION_BITS); + + ss0 = _mm_packs_epi32(ss0, ss1); + ss2 = _mm_packs_epi32(ss2, ss3); + ss0 = _mm_packus_epi16(ss0, ss2); + _mm_storeu_si128((__m128i*) &out[x], ss0); + } + for (; x < im->xsize-1; x++) { + __m128i ss = _mm_set1_epi32(offset); + + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in1[x-1]), shuffle); + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in1[x+1]), shuffle); + pix01 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in0[x-1]), shuffle); + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in0[x+1]), shuffle); + pix11 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in_1[x-1]), shuffle); + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in_1[x+1]), shuffle); + pix21 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); + + MM_KERNEL_SUM(ss, 0, 0x00); + MM_KERNEL_SUM(ss, 1, 0x55); + + ss = _mm_srai_epi32(ss, PRECISION_BITS); + + ss = 
_mm_packs_epi32(ss, ss); + ss = _mm_packus_epi16(ss, ss); + out[x] = _mm_cvtsi128_si32(ss); + } + out[x] = in0[x]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#endif + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); +} diff --git a/src/libImaging/FilterSIMD_5x5f_4u8.c b/src/libImaging/FilterSIMD_5x5f_4u8.c new file mode 100644 index 000000000..097242adb --- /dev/null +++ b/src/libImaging/FilterSIMD_5x5f_4u8.c @@ -0,0 +1,126 @@ +void +ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x5_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ + pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); + +#define MM_KERNEL1x5_SUM(row, kindex) \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[5 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[10 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, _mm_set1_ps(kernel[15 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, _mm_set1_ps(kernel[20 + kindex]))); + + int x, y; + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + memcpy(imOut->image32[1], im->image32[1], im->linesize); + for (y = 2; y < im->ysize-2; y++) { + INT32* in_2 = im->image32[y-2]; + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* in2 = im->image32[y+2]; + INT32* out = imOut->image32[y]; + __m128 pix00, pix10, pix20, pix30, pix40; + __m128 pix01, pix11, pix21, pix31, pix41; + __m128 pix02, pix12, pix22, pix32, pix42; + __m128 pix03, pix13, pix23, pix33, pix43; + __m128 pix04, pix14, pix24, pix34, pix44; + MM_KERNEL1x5_LOAD(0, 0); + MM_KERNEL1x5_LOAD(1, 1); + MM_KERNEL1x5_LOAD(2, 2); + MM_KERNEL1x5_LOAD(3, 3); + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + for (; x < im->xsize-2-4; x += 5) { + __m128 ss; + __m128i ssi0, ssi1, ssi2, ssi3; + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(0, 0); + MM_KERNEL1x5_SUM(1, 1); + MM_KERNEL1x5_SUM(2, 2); + MM_KERNEL1x5_SUM(3, 3); + MM_KERNEL1x5_LOAD(4, x+2); + MM_KERNEL1x5_SUM(4, 4); + ssi0 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(1, 0); + MM_KERNEL1x5_SUM(2, 1); + MM_KERNEL1x5_SUM(3, 2); + MM_KERNEL1x5_SUM(4, 3); + MM_KERNEL1x5_LOAD(0, x+3); + MM_KERNEL1x5_SUM(0, 4); + ssi1 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(2, 0); + MM_KERNEL1x5_SUM(3, 1); + MM_KERNEL1x5_SUM(4, 2); + MM_KERNEL1x5_SUM(0, 3); + MM_KERNEL1x5_LOAD(1, x+4); + MM_KERNEL1x5_SUM(1, 4); + ssi2 = _mm_cvtps_epi32(ss); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(3, 0); + MM_KERNEL1x5_SUM(4, 1); + MM_KERNEL1x5_SUM(0, 2); + MM_KERNEL1x5_SUM(1, 3); + MM_KERNEL1x5_LOAD(2, x+5); + MM_KERNEL1x5_SUM(2, 4); + ssi3 = _mm_cvtps_epi32(ss); + + ssi0 = _mm_packs_epi32(ssi0, ssi1); + ssi1 = _mm_packs_epi32(ssi2, ssi3); + ssi0 = _mm_packus_epi16(ssi0, ssi1); + _mm_storeu_si128((__m128i*) &out[x], ssi0); + + ss = _mm_set1_ps(offset); + MM_KERNEL1x5_SUM(4, 0); + MM_KERNEL1x5_SUM(0, 1); + MM_KERNEL1x5_SUM(1, 2); + MM_KERNEL1x5_SUM(2, 3); + MM_KERNEL1x5_LOAD(3, x+6); + MM_KERNEL1x5_SUM(3, 4); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x+4] = _mm_cvtsi128_si32(ssi0); + } + for (; x < 
im->xsize-2; x++) { + __m128 ss = _mm_set1_ps(offset); + __m128i ssi0; + MM_KERNEL1x5_LOAD(0, x-2); + MM_KERNEL1x5_SUM(0, 0); + MM_KERNEL1x5_LOAD(0, x-1); + MM_KERNEL1x5_SUM(0, 1); + MM_KERNEL1x5_LOAD(0, x+0); + MM_KERNEL1x5_SUM(0, 2); + MM_KERNEL1x5_LOAD(0, x+1); + MM_KERNEL1x5_SUM(0, 3); + MM_KERNEL1x5_LOAD(0, x+2); + MM_KERNEL1x5_SUM(0, 4); + ssi0 = _mm_cvtps_epi32(ss); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x] = in0[x]; + out[x+1] = in0[x+1]; + } + memcpy(imOut->image32[y], im->image32[y], im->linesize); + memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize); + +#undef MM_KERNEL1x5_LOAD +#undef MM_KERNEL1x5_SUM +} diff --git a/src/libImaging/FilterSIMD_5x5f_u8.c b/src/libImaging/FilterSIMD_5x5f_u8.c new file mode 100644 index 000000000..714bbdf33 --- /dev/null +++ b/src/libImaging/FilterSIMD_5x5f_u8.c @@ -0,0 +1,100 @@ +void +ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel, + float offset) +{ +#define MM_KERNEL1x5_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ + pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ + pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); + +#define MM_KERNEL1x5_SUM(ss, row, krow) \ + ss = _mm_mul_ps(pix0##row, kernel0##krow); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##krow)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##krow)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, kernel3##krow)); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, kernel4##krow)); + + int x, y; + + memcpy(imOut->image8[0], im->image8[0], im->linesize); + memcpy(imOut->image8[1], im->image8[1], im->linesize); + for (y = 2; y < im->ysize-2; y++) { + UINT8* in_2 = im->image8[y-2]; + UINT8* in_1 = im->image8[y-1]; + UINT8* in0 = im->image8[y]; + UINT8* in1 = im->image8[y+1]; + UINT8* in2 = im->image8[y+2]; + UINT8* out = imOut->image8[y]; + __m128 kernelx0 = _mm_set_ps(kernel[15], kernel[10], kernel[5], kernel[0]); + __m128 kernel00 = _mm_loadu_ps(&kernel[0+1]); + __m128 kernel10 = _mm_loadu_ps(&kernel[5+1]); + __m128 kernel20 = _mm_loadu_ps(&kernel[10+1]); + __m128 kernel30 = _mm_loadu_ps(&kernel[15+1]); + __m128 kernel40 = _mm_loadu_ps(&kernel[20+1]); + __m128 kernelx1 = _mm_set_ps(kernel[19], kernel[14], kernel[9], kernel[4]); + __m128 kernel01 = _mm_loadu_ps(&kernel[0+0]); + __m128 kernel11 = _mm_loadu_ps(&kernel[5+0]); + __m128 kernel21 = _mm_loadu_ps(&kernel[10+0]); + __m128 kernel31 = _mm_loadu_ps(&kernel[15+0]); + __m128 kernel41 = _mm_loadu_ps(&kernel[20+0]); + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + for (; x < im->xsize-2-1; x += 2) { + __m128 ss0, ss1; + __m128i ssi0; + __m128 pix00, pix10, pix20, pix30, pix40; + + MM_KERNEL1x5_LOAD(0, x-1); + MM_KERNEL1x5_SUM(ss0, 0, 0); + ss0 = _mm_add_ps(ss0, _mm_mul_ps( + _mm_set_ps(in_1[x-2], in0[x-2], in1[x-2], in2[x-2]), + kernelx0)); + + MM_KERNEL1x5_SUM(ss1, 0, 1); + ss1 = _mm_add_ps(ss1, _mm_mul_ps( + _mm_set_ps(in_1[x+3], in0[x+3], in1[x+3], in2[x+3]), + kernelx1)); + + ss0 = _mm_hadd_ps(ss0, ss1); + ss0 = _mm_add_ps(ss0, _mm_set_ps( + offset, kernel[24] * in_2[x+3], + offset, kernel[20] * in_2[x-2])); + ss0 = _mm_hadd_ps(ss0, ss0); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + *((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0); + } + for (; x < im->xsize-2; x ++) { + __m128 ss0; 
+ __m128i ssi0; + __m128 pix00, pix10, pix20, pix30, pix40; + + MM_KERNEL1x5_LOAD(0, x-2); + MM_KERNEL1x5_SUM(ss0, 0, 1); + ss0 = _mm_add_ps(ss0, _mm_mul_ps( + _mm_set_ps(in_1[x+2], in0[x+2], in1[x+2], in2[x+2]), + kernelx1)); + + ss0 = _mm_add_ps(ss0, _mm_set_ps( + 0, 0, offset, kernel[24] * in_2[x+2])); + ss0 = _mm_hadd_ps(ss0, ss0); + ss0 = _mm_hadd_ps(ss0, ss0); + ssi0 = _mm_cvtps_epi32(ss0); + ssi0 = _mm_packs_epi32(ssi0, ssi0); + ssi0 = _mm_packus_epi16(ssi0, ssi0); + out[x] = _mm_cvtsi128_si32(ssi0); + } + out[x+0] = in0[x+0]; + out[x+1] = in0[x+1]; + } + memcpy(imOut->image8[y], im->image8[y], im->linesize); + memcpy(imOut->image8[y+1], im->image8[y+1], im->linesize); + +#undef MM_KERNEL1x5_LOAD +#undef MM_KERNEL1x5_SUM +} diff --git a/src/libImaging/FilterSIMD_5x5i_4u8.c b/src/libImaging/FilterSIMD_5x5i_4u8.c new file mode 100644 index 000000000..d91d472d0 --- /dev/null +++ b/src/libImaging/FilterSIMD_5x5i_4u8.c @@ -0,0 +1,250 @@ +void +ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, + INT32 offset) +{ + int x, y; + + offset += 1 << (PRECISION_BITS-1); + + memcpy(imOut->image32[0], im->image32[0], im->linesize); + memcpy(imOut->image32[1], im->image32[1], im->linesize); + for (y = 2; y < im->ysize-2; y++) { + INT32* in_2 = im->image32[y-2]; + INT32* in_1 = im->image32[y-1]; + INT32* in0 = im->image32[y]; + INT32* in1 = im->image32[y+1]; + INT32* in2 = im->image32[y+2]; + INT32* out = imOut->image32[y]; +#if defined(__AVX2__) + + #define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle)); \ + pix0##row = _mm256_inserti128_si256(pix0##row, _mm256_castsi256_si128(pix0##row), 1); \ + pix1##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ + pix1##row = _mm256_inserti128_si256(pix1##row, _mm256_castsi256_si128(pix1##row), 1); \ + pix2##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ + pix2##row = _mm256_inserti128_si256(pix2##row, _mm256_castsi256_si128(pix2##row), 1); \ + pix3##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ + pix3##row = _mm256_inserti128_si256(pix3##row, _mm256_castsi256_si128(pix3##row), 1); \ + pix4##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle)); \ + pix4##row = _mm256_inserti128_si256(pix4##row, _mm256_castsi256_si128(pix4##row), 1); + + #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix0##row, zero), _mm256_shuffle_epi32(kernel0##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix1##row, zero), _mm256_shuffle_epi32(kernel1##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix2##row, zero), _mm256_shuffle_epi32(kernel2##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix3##row, zero), _mm256_shuffle_epi32(kernel3##krow, kctl))); \ + ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ + unpack_epi8(pix4##row, zero), _mm256_shuffle_epi32(kernel4##krow, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m256i kernel00 = _mm256_set_epi16( + 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0, + 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]); + __m256i kernel10 = _mm256_set_epi16( + 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0, + 0, 0, 0, 
kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]); + __m256i kernel20 = _mm256_set_epi16( + 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0, + 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]); + __m256i kernel30 = _mm256_set_epi16( + 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0, + 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]); + __m256i kernel40 = _mm256_set_epi16( + 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0, + 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]); + __m256i zero = _mm256_setzero_si256(); + __m256i pix00, pix10, pix20, pix30, pix40; + __m256i pix01, pix11, pix21, pix31, pix41; + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + + MM_KERNEL_LOAD(0, 0); + + for (; x < im->xsize-2-3; x += 4) { + __m256i ss0, ss2; + __m128i ssout; + + ss0 = _mm256_set1_epi32(offset); + ss2 = _mm256_set1_epi32(offset); + + MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss2, 0, _mm256_unpackhi_epi8, 0, 0x00); + + MM_KERNEL_LOAD(1, x+2); + MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + MM_KERNEL_SUM(ss2, 1, _mm256_unpacklo_epi8, 0, 0x55); + MM_KERNEL_SUM(ss2, 1, _mm256_unpackhi_epi8, 0, 0xaa); + ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS); + + pix00 = pix01; + pix10 = pix11; + pix20 = pix21; + pix30 = pix31; + pix40 = pix41; + + ss0 = _mm256_packs_epi32(ss0, ss2); + ssout = _mm_packus_epi16( + _mm256_extracti128_si256(ss0, 0), + _mm256_extracti128_si256(ss0, 1)); + ssout = _mm_shuffle_epi32(ssout, 0xd8); + _mm_storeu_si128((__m128i*) &out[x], ssout); + } + for (; x < im->xsize-2; x++) { + __m256i ss0; + + MM_KERNEL_LOAD(0, x-2); + pix01 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle)); + pix01 = _mm256_inserti128_si256(pix01, _mm256_castsi256_si128(pix01), 1); + pix11 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle)); + pix11 = _mm256_inserti128_si256(pix11, _mm256_castsi256_si128(pix11), 1); + pix21 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle)); + pix21 = _mm256_inserti128_si256(pix21, _mm256_castsi256_si128(pix21), 1); + pix31 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle)); + pix31 = _mm256_inserti128_si256(pix31, _mm256_castsi256_si128(pix31), 1); + pix41 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle)); + pix41 = _mm256_inserti128_si256(pix41, _mm256_castsi256_si128(pix41), 1); + + ss0 = _mm256_set1_epi32(offset); + MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa); + ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); + + ss0 = _mm256_packs_epi32(ss0, ss0); + ss0 = _mm256_packus_epi16(ss0, ss0); + out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss0)); + } + out[x] = in0[x]; + out[x+1] = in0[x+1]; + #undef MM_KERNEL_LOAD + #undef MM_KERNEL_SUM + +#else + + #define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle); \ + pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ + pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ + pix3##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ 
+ pix4##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle); + + #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix0##row, zero), _mm_shuffle_epi32(kernel0##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix1##row, zero), _mm_shuffle_epi32(kernel1##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix2##row, zero), _mm_shuffle_epi32(kernel2##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix3##row, zero), _mm_shuffle_epi32(kernel3##krow, kctl))); \ + ss = _mm_add_epi32(ss, _mm_madd_epi16( \ + unpack_epi8(pix4##row, zero), _mm_shuffle_epi32(kernel4##krow, kctl))); + + __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i kernel00 = _mm_set_epi16( + 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]); + __m128i kernel10 = _mm_set_epi16( + 0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]); + __m128i kernel20 = _mm_set_epi16( + 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]); + __m128i kernel30 = _mm_set_epi16( + 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]); + __m128i kernel40 = _mm_set_epi16( + 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]); + __m128i kernel01 = _mm_set_epi16( + 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0); + __m128i kernel11 = _mm_set_epi16( + 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0); + __m128i kernel21 = _mm_set_epi16( + 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0); + __m128i kernel31 = _mm_set_epi16( + 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0); + __m128i kernel41 = _mm_set_epi16( + 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0); + __m128i zero = _mm_setzero_si128(); + __m128i pix00, pix10, pix20, pix30, pix40; + __m128i pix01, pix11, pix21, pix31, pix41; + + out[0] = in0[0]; + out[1] = in0[1]; + x = 2; + + MM_KERNEL_LOAD(0, 0); + + for (; x < im->xsize-2-3; x += 4) { + __m128i ss0, ss1, ss2, ss3; + + ss0 = _mm_set1_epi32(offset); + ss1 = _mm_set1_epi32(offset); + ss2 = _mm_set1_epi32(offset); + ss3 = _mm_set1_epi32(offset); + + MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00); + MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55); + MM_KERNEL_SUM(ss1, 0, _mm_unpacklo_epi8, 1, 0x00); + MM_KERNEL_SUM(ss1, 0, _mm_unpackhi_epi8, 1, 0x55); + MM_KERNEL_SUM(ss2, 0, _mm_unpackhi_epi8, 0, 0x00); + MM_KERNEL_SUM(ss3, 0, _mm_unpackhi_epi8, 1, 0x00); + + MM_KERNEL_LOAD(1, x+2); + MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa); + ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); + MM_KERNEL_SUM(ss1, 1, _mm_unpacklo_epi8, 1, 0xaa); + ss1 = _mm_srai_epi32(ss1, PRECISION_BITS); + MM_KERNEL_SUM(ss2, 1, _mm_unpacklo_epi8, 0, 0x55); + MM_KERNEL_SUM(ss2, 1, _mm_unpackhi_epi8, 0, 0xaa); + ss2 = _mm_srai_epi32(ss2, PRECISION_BITS); + MM_KERNEL_SUM(ss3, 1, _mm_unpacklo_epi8, 1, 0x55); + MM_KERNEL_SUM(ss3, 1, _mm_unpackhi_epi8, 1, 0xaa); + ss3 = _mm_srai_epi32(ss3, PRECISION_BITS); + + pix00 = pix01; + pix10 = pix11; + pix20 = pix21; + pix30 = pix31; + pix40 = pix41; + + ss0 = _mm_packs_epi32(ss0, ss1); + ss2 = _mm_packs_epi32(ss2, ss3); + ss0 = _mm_packus_epi16(ss0, ss2); + _mm_storeu_si128((__m128i*) &out[x], ss0); + } + for (; x < im->xsize-2; x++) { + __m128i ss0; + + MM_KERNEL_LOAD(0, x-2); + pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle); + pix11 = 
_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle);
+            pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle);
+            pix31 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle);
+            pix41 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle);
+
+            ss0 = _mm_set1_epi32(offset);
+            MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00);
+            MM_KERNEL_SUM(ss0, 0, _mm_unpackhi_epi8, 0, 0x55);
+            MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa);
+            ss0 = _mm_srai_epi32(ss0, PRECISION_BITS);
+
+            ss0 = _mm_packs_epi32(ss0, ss0);
+            ss0 = _mm_packus_epi16(ss0, ss0);
+            out[x] = _mm_cvtsi128_si32(ss0);
+        }
+        out[x] = in0[x];
+        out[x+1] = in0[x+1];
+    #undef MM_KERNEL_LOAD
+    #undef MM_KERNEL_SUM
+
+#endif
+    }
+    memcpy(imOut->image32[y], im->image32[y], im->linesize);
+    memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize);
+}
From f5f809196f258732521570dfbaaff24af0681c09 Mon Sep 17 00:00:00 2001
From: Aleksandr Karpinskii
Date: Sun, 11 Aug 2024 22:34:34 +0400
Subject: [PATCH 11/12] Make SSE4 optional

---
 src/libImaging/Filter.c | 384 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 342 insertions(+), 42 deletions(-)

diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c
index df3ca3630..6b495a7d7 100644
--- a/src/libImaging/Filter.c
+++ b/src/libImaging/Filter.c
@@ -26,6 +26,7 @@
 
 #include "Imaging.h"
 
+#if defined(__SSE4__)
 /* 5 bits are enough to account for all kernel coefficients (1<<5 > 25).
    8 is the number of bits in the result.
    Any coefficient delta smaller than this precision will have no effect. */
@@ -41,6 +42,18 @@
 #include "FilterSIMD_5x5f_u8.c"
 #include "FilterSIMD_5x5f_4u8.c"
 #include "FilterSIMD_5x5i_4u8.c"
+#endif // defined(__SSE4__)
+
+static inline UINT8
+clip8(float in) {
+    if (in <= 0.0) {
+        return 0;
+    }
+    if (in >= 255.0) {
+        return 255;
+    }
+    return (UINT8)in;
+}
 
 static inline INT32
 clip32(float in) {
@@ -110,14 +123,283 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin) {
     return imOut;
 }
 
+void
+ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) {
+#define KERNEL1x3(in0, x, kernel, d) \
+    (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \
+     _i2f(in0[x + d]) * (kernel)[2])
+
+    int x = 0, y = 0;
+
+    memcpy(imOut->image[0], im->image[0], im->linesize);
+    if (im->bands == 1) {
+        // Add one time for rounding
+        offset += 0.5;
+        if (im->type == IMAGING_TYPE_INT32) {
+            for (y = 1; y < im->ysize - 1; y++) {
+                INT32 *in_1 = (INT32 *)im->image[y - 1];
+                INT32 *in0 = (INT32 *)im->image[y];
+                INT32 *in1 = (INT32 *)im->image[y + 1];
+                INT32 *out = (INT32 *)imOut->image[y];
+
+                out[0] = in0[0];
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss = offset;
+                    ss += KERNEL1x3(in1, x, &kernel[0], 1);
+                    ss += KERNEL1x3(in0, x, &kernel[3], 1);
+                    ss += KERNEL1x3(in_1, x, &kernel[6], 1);
+                    out[x] = clip32(ss);
+                }
+                out[x] = in0[x];
+            }
+        } else {
+            for (y = 1; y < im->ysize - 1; y++) {
+                UINT8 *in_1 = (UINT8 *)im->image[y - 1];
+                UINT8 *in0 = (UINT8 *)im->image[y];
+                UINT8 *in1 = (UINT8 *)im->image[y + 1];
+                UINT8 *out = (UINT8 *)imOut->image[y];
+
+                out[0] = in0[0];
+                for (x = 1; x < im->xsize - 1; x++) {
+                    float ss = offset;
+                    ss += KERNEL1x3(in1, x, &kernel[0], 1);
+                    ss += KERNEL1x3(in0, x, &kernel[3], 1);
+                    ss += KERNEL1x3(in_1, x, &kernel[6], 1);
+                    out[x] = clip8(ss);
+                }
+                out[x] = in0[x];
+            }
+        }
+    } else {
+        // Add one time for rounding
+        offset += 0.5;
+        for (y = 1; y < im->ysize - 1; y++) {
+            UINT8 *in_1 = (UINT8 *)im->image[y - 1];
+            UINT8 *in0 = (UINT8 *)im->image[y];
+            UINT8 
*in1 = (UINT8 *)im->image[y + 1]; + UINT8 *out = (UINT8 *)imOut->image[y]; + + memcpy(out, in0, sizeof(UINT32)); + if (im->bands == 2) { + for (x = 1; x < im->xsize - 1; x++) { + float ss0 = offset; + float ss3 = offset; + UINT32 v; + ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); + ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4); + ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); + ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4); + ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); + ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4); + v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); + memcpy(out + x * sizeof(v), &v, sizeof(v)); + } + } else if (im->bands == 3) { + for (x = 1; x < im->xsize - 1; x++) { + float ss0 = offset; + float ss1 = offset; + float ss2 = offset; + UINT32 v; + ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); + ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4); + ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4); + ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); + ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4); + ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4); + ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); + ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4); + ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4); + v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0); + memcpy(out + x * sizeof(v), &v, sizeof(v)); + } + } else if (im->bands == 4) { + for (x = 1; x < im->xsize - 1; x++) { + float ss0 = offset; + float ss1 = offset; + float ss2 = offset; + float ss3 = offset; + UINT32 v; + ss0 += KERNEL1x3(in1, x * 4 + 0, &kernel[0], 4); + ss1 += KERNEL1x3(in1, x * 4 + 1, &kernel[0], 4); + ss2 += KERNEL1x3(in1, x * 4 + 2, &kernel[0], 4); + ss3 += KERNEL1x3(in1, x * 4 + 3, &kernel[0], 4); + ss0 += KERNEL1x3(in0, x * 4 + 0, &kernel[3], 4); + ss1 += KERNEL1x3(in0, x * 4 + 1, &kernel[3], 4); + ss2 += KERNEL1x3(in0, x * 4 + 2, &kernel[3], 4); + ss3 += KERNEL1x3(in0, x * 4 + 3, &kernel[3], 4); + ss0 += KERNEL1x3(in_1, x * 4 + 0, &kernel[6], 4); + ss1 += KERNEL1x3(in_1, x * 4 + 1, &kernel[6], 4); + ss2 += KERNEL1x3(in_1, x * 4 + 2, &kernel[6], 4); + ss3 += KERNEL1x3(in_1, x * 4 + 3, &kernel[6], 4); + v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); + memcpy(out + x * sizeof(v), &v, sizeof(v)); + } + } + memcpy(out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32)); + } + } + memcpy(imOut->image[y], im->image[y], im->linesize); +} + +void +ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { +#define KERNEL1x5(in0, x, kernel, d) \ + (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \ + _i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] + \ + _i2f(in0[x + d + d]) * (kernel)[4]) + + int x = 0, y = 0; + + memcpy(imOut->image[0], im->image[0], im->linesize); + memcpy(imOut->image[1], im->image[1], im->linesize); + if (im->bands == 1) { + // Add one time for rounding + offset += 0.5; + if (im->type == IMAGING_TYPE_INT32) { + for (y = 2; y < im->ysize - 2; y++) { + INT32 *in_2 = (INT32 *)im->image[y - 2]; + INT32 *in_1 = (INT32 *)im->image[y - 1]; + INT32 *in0 = (INT32 *)im->image[y]; + INT32 *in1 = (INT32 *)im->image[y + 1]; + INT32 *in2 = (INT32 *)im->image[y + 2]; + INT32 *out = (INT32 *)imOut->image[y]; + + out[0] = in0[0]; + out[1] = in0[1]; + for (x = 2; x < im->xsize - 2; x++) { + float ss = offset; + ss += KERNEL1x5(in2, x, &kernel[0], 1); + ss += KERNEL1x5(in1, x, &kernel[5], 1); + ss += KERNEL1x5(in0, x, &kernel[10], 1); + ss += KERNEL1x5(in_1, x, &kernel[15], 1); + ss += KERNEL1x5(in_2, x, 
&kernel[20], 1); + out[x] = clip32(ss); + } + out[x + 0] = in0[x + 0]; + out[x + 1] = in0[x + 1]; + } + } else { + for (y = 2; y < im->ysize - 2; y++) { + UINT8 *in_2 = (UINT8 *)im->image[y - 2]; + UINT8 *in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *in0 = (UINT8 *)im->image[y]; + UINT8 *in1 = (UINT8 *)im->image[y + 1]; + UINT8 *in2 = (UINT8 *)im->image[y + 2]; + UINT8 *out = (UINT8 *)imOut->image[y]; + + out[0] = in0[0]; + out[1] = in0[1]; + for (x = 2; x < im->xsize - 2; x++) { + float ss = offset; + ss += KERNEL1x5(in2, x, &kernel[0], 1); + ss += KERNEL1x5(in1, x, &kernel[5], 1); + ss += KERNEL1x5(in0, x, &kernel[10], 1); + ss += KERNEL1x5(in_1, x, &kernel[15], 1); + ss += KERNEL1x5(in_2, x, &kernel[20], 1); + out[x] = clip8(ss); + } + out[x + 0] = in0[x + 0]; + out[x + 1] = in0[x + 1]; + } + } + } else { + // Add one time for rounding + offset += 0.5; + for (y = 2; y < im->ysize - 2; y++) { + UINT8 *in_2 = (UINT8 *)im->image[y - 2]; + UINT8 *in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *in0 = (UINT8 *)im->image[y]; + UINT8 *in1 = (UINT8 *)im->image[y + 1]; + UINT8 *in2 = (UINT8 *)im->image[y + 2]; + UINT8 *out = (UINT8 *)imOut->image[y]; + + memcpy(out, in0, sizeof(UINT32) * 2); + if (im->bands == 2) { + for (x = 2; x < im->xsize - 2; x++) { + float ss0 = offset; + float ss3 = offset; + UINT32 v; + ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); + ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4); + ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); + ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4); + ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); + ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4); + ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); + ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4); + ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); + ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4); + v = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3)); + memcpy(out + x * sizeof(v), &v, sizeof(v)); + } + } else if (im->bands == 3) { + for (x = 2; x < im->xsize - 2; x++) { + float ss0 = offset; + float ss1 = offset; + float ss2 = offset; + UINT32 v; + ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); + ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4); + ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4); + ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); + ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4); + ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4); + ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); + ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4); + ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4); + ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); + ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4); + ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4); + ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); + ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4); + ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4); + v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), 0); + memcpy(out + x * sizeof(v), &v, sizeof(v)); + } + } else if (im->bands == 4) { + for (x = 2; x < im->xsize - 2; x++) { + float ss0 = offset; + float ss1 = offset; + float ss2 = offset; + float ss3 = offset; + UINT32 v; + ss0 += KERNEL1x5(in2, x * 4 + 0, &kernel[0], 4); + ss1 += KERNEL1x5(in2, x * 4 + 1, &kernel[0], 4); + ss2 += KERNEL1x5(in2, x * 4 + 2, &kernel[0], 4); + ss3 += KERNEL1x5(in2, x * 4 + 3, &kernel[0], 4); + ss0 += KERNEL1x5(in1, x * 4 + 0, &kernel[5], 4); + ss1 += KERNEL1x5(in1, x * 4 + 1, &kernel[5], 4); + ss2 += KERNEL1x5(in1, x * 4 + 2, &kernel[5], 4); + ss3 += KERNEL1x5(in1, x * 4 + 3, &kernel[5], 4); + 
ss0 += KERNEL1x5(in0, x * 4 + 0, &kernel[10], 4); + ss1 += KERNEL1x5(in0, x * 4 + 1, &kernel[10], 4); + ss2 += KERNEL1x5(in0, x * 4 + 2, &kernel[10], 4); + ss3 += KERNEL1x5(in0, x * 4 + 3, &kernel[10], 4); + ss0 += KERNEL1x5(in_1, x * 4 + 0, &kernel[15], 4); + ss1 += KERNEL1x5(in_1, x * 4 + 1, &kernel[15], 4); + ss2 += KERNEL1x5(in_1, x * 4 + 2, &kernel[15], 4); + ss3 += KERNEL1x5(in_1, x * 4 + 3, &kernel[15], 4); + ss0 += KERNEL1x5(in_2, x * 4 + 0, &kernel[20], 4); + ss1 += KERNEL1x5(in_2, x * 4 + 1, &kernel[20], 4); + ss2 += KERNEL1x5(in_2, x * 4 + 2, &kernel[20], 4); + ss3 += KERNEL1x5(in_2, x * 4 + 3, &kernel[20], 4); + v = MAKE_UINT32(clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3)); + memcpy(out + x * sizeof(v), &v, sizeof(v)); + } + } + memcpy( + out + x * sizeof(UINT32), in0 + x * sizeof(UINT32), sizeof(UINT32) * 2 + ); + } + } + memcpy(imOut->image[y], im->image[y], im->linesize); + memcpy(imOut->image[y + 1], im->image[y + 1], im->linesize); +} + Imaging ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 offset) { - ImagingSectionCookie cookie; Imaging imOut; - int i, fast_path = 1; - FLOAT32 tmp; - INT16 norm_kernel[25]; - INT32 norm_offset; + ImagingSectionCookie cookie; if (im->type != IMAGING_TYPE_UINT8 && im->type != IMAGING_TYPE_INT32) { return (Imaging)ImagingError_ModeError(); @@ -136,49 +418,67 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32 *kernel, FLOAT32 o return NULL; } - for (i = 0; i < xsize * ysize; i++) { - tmp = kernel[i] * (1 << PRECISION_BITS); - tmp += (tmp >= 0) ? 0.5 : -0.5; - if (tmp >= 32768 || tmp <= -32769) { - fast_path = 0; - break; - } - norm_kernel[i] = tmp; - } - tmp = offset * (1 << PRECISION_BITS); - tmp += (tmp >= 0) ? 0.5 : -0.5; - if (tmp >= 1 << 30) { - norm_offset = 1 << 30; - } else if (tmp <= -1 << 30) { - norm_offset = -1 << 30; - } else { - norm_offset = tmp; - } - ImagingSectionEnter(&cookie); +#if defined(__SSE4__) + { + int i, fast_path = 1; + FLOAT32 tmp; + INT16 norm_kernel[25]; + INT32 norm_offset; + + for (i = 0; i < xsize * ysize; i++) { + tmp = kernel[i] * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 0.5 : -0.5; + if (tmp >= 32768 || tmp <= -32769) { + fast_path = 0; + break; + } + norm_kernel[i] = tmp; + } + tmp = offset * (1 << PRECISION_BITS); + tmp += (tmp >= 0) ? 0.5 : -0.5; + if (tmp >= 1 << 30) { + norm_offset = 1 << 30; + } else if (tmp <= -1 << 30) { + norm_offset = -1 << 30; + } else { + norm_offset = tmp; + } + + if (xsize == 3) { + /* 3x3 kernel. */ + if (im->image8) { + ImagingFilter3x3f_u8(imOut, im, kernel, offset); + } else { + if (fast_path) { + ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter3x3f_4u8(imOut, im, kernel, offset); + } + } + } else { + /* 5x5 kernel. */ + if (im->image8) { + ImagingFilter5x5f_u8(imOut, im, kernel, offset); + } else { + if (fast_path) { + ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset); + } else { + ImagingFilter5x5f_4u8(imOut, im, kernel, offset); + } + } + } + } +#else if (xsize == 3) { /* 3x3 kernel. */ - if (im->image8) { - ImagingFilter3x3f_u8(imOut, im, kernel, offset); - } else { - if (fast_path) { - ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset); - } else { - ImagingFilter3x3f_4u8(imOut, im, kernel, offset); - } - } + ImagingFilter3x3(imOut, im, kernel, offset); } else { /* 5x5 kernel. 
*/ - if (im->image8) { - ImagingFilter5x5f_u8(imOut, im, kernel, offset); - } else { - if (fast_path) { - ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset); - } else { - ImagingFilter5x5f_4u8(imOut, im, kernel, offset); - } - } + ImagingFilter5x5(imOut, im, kernel, offset); } +#endif + ImagingSectionLeave(&cookie); return imOut; } From 2481c6782c927bacf99869299c7bdf4234f2ea85 Mon Sep 17 00:00:00 2001 From: Aleksandr Karpinskii Date: Sun, 11 Aug 2024 22:38:38 +0400 Subject: [PATCH 12/12] Apply clang-format --- src/libImaging/FilterSIMD_3x3f_4u8.c | 123 +++++---- src/libImaging/FilterSIMD_3x3f_u8.c | 76 +++--- src/libImaging/FilterSIMD_3x3i_4u8.c | 256 ++++++++++++------- src/libImaging/FilterSIMD_5x5f_4u8.c | 64 +++-- src/libImaging/FilterSIMD_5x5f_u8.c | 108 ++++---- src/libImaging/FilterSIMD_5x5i_4u8.c | 358 +++++++++++++++++++-------- 6 files changed, 606 insertions(+), 379 deletions(-) diff --git a/src/libImaging/FilterSIMD_3x3f_4u8.c b/src/libImaging/FilterSIMD_3x3f_4u8.c index 02272f688..842be1ad3 100644 --- a/src/libImaging/FilterSIMD_3x3f_4u8.c +++ b/src/libImaging/FilterSIMD_3x3f_4u8.c @@ -1,13 +1,11 @@ void -ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, - float offset) -{ -#define MM_KERNEL1x3_LOAD(row, x) \ +ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float *kernel, float offset) { +#define MM_KERNEL1x3_LOAD(row, x) \ pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); -#define MM_KERNEL1x3_SUM(row, kindex) \ +#define MM_KERNEL1x3_SUM(row, kindex) \ ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[3 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[6 + kindex]))); @@ -16,47 +14,46 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, memcpy(imOut->image32[0], im->image32[0], im->linesize); - for (y = 1; y < im->ysize-1; y++) { - INT32* in_1 = im->image32[y-1]; - INT32* in0 = im->image32[y]; - INT32* in1 = im->image32[y+1]; - INT32* out = imOut->image32[y]; + for (y = 1; y < im->ysize - 1; y++) { + INT32 *in_1 = im->image32[y - 1]; + INT32 *in0 = im->image32[y]; + INT32 *in1 = im->image32[y + 1]; + INT32 *out = imOut->image32[y]; #if defined(__AVX2__) -#define MM256_LOAD(row, x) \ +#define MM256_LOAD(row, x) \ pix0##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in1[x])); \ pix1##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in0[x])); \ pix2##row = _mm256_cvtepi32_ps(mm256_cvtepu8_epi32(&in_1[x])); -#define MM256_SUM(pixrow, kernelrow) \ +#define MM256_SUM(pixrow, kernelrow) \ ss = _mm256_add_ps(ss, _mm256_mul_ps(pix0##pixrow, kernel0##kernelrow)); \ ss = _mm256_add_ps(ss, _mm256_mul_ps(pix1##pixrow, kernel1##kernelrow)); \ ss = _mm256_add_ps(ss, _mm256_mul_ps(pix2##pixrow, kernel2##kernelrow)); -#define MM256_PERMUTE(row, from0, from1, control) \ +#define MM256_PERMUTE(row, from0, from1, control) \ pix0##row = _mm256_permute2f128_ps(pix0##from0, pix0##from1, control); \ pix1##row = _mm256_permute2f128_ps(pix1##from0, pix1##from1, control); \ pix2##row = _mm256_permute2f128_ps(pix2##from0, pix2##from1, control); -#define MM256_OUT(x) \ - _mm_cvtps_epi32(_mm_add_ps( \ - _mm256_extractf128_ps(ss, 0), \ - _mm256_extractf128_ps(ss, 1) \ - )) - +#define MM256_OUT(x) \ + _mm_cvtps_epi32( \ + _mm_add_ps(_mm256_extractf128_ps(ss, 0), _mm256_extractf128_ps(ss, 1)) \ + ) + __m256 kernel00 = 
_mm256_insertf128_ps( - _mm256_set1_ps(kernel[0+0]), - _mm_set1_ps(kernel[0+1]), 1); - __m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0+2])); + _mm256_set1_ps(kernel[0 + 0]), _mm_set1_ps(kernel[0 + 1]), 1 + ); + __m256 kernel01 = _mm256_castps128_ps256(_mm_set1_ps(kernel[0 + 2])); __m256 kernel10 = _mm256_insertf128_ps( - _mm256_set1_ps(kernel[3+0]), - _mm_set1_ps(kernel[3+1]), 1); - __m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3+2])); + _mm256_set1_ps(kernel[3 + 0]), _mm_set1_ps(kernel[3 + 1]), 1 + ); + __m256 kernel11 = _mm256_castps128_ps256(_mm_set1_ps(kernel[3 + 2])); __m256 kernel20 = _mm256_insertf128_ps( - _mm256_set1_ps(kernel[6+0]), - _mm_set1_ps(kernel[6+1]), 1); - __m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6+2])); + _mm256_set1_ps(kernel[6 + 0]), _mm_set1_ps(kernel[6 + 1]), 1 + ); + __m256 kernel21 = _mm256_castps128_ps256(_mm_set1_ps(kernel[6 + 2])); __m256 pix00, pix10, pix20; __m256 pix01, pix11, pix21; __m256 pix02, pix12, pix22; @@ -64,78 +61,78 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, out[0] = in0[0]; x = 1; MM256_LOAD(0, 0); - for (; x < im->xsize-1-7; x += 8) { + for (; x < im->xsize - 1 - 7; x += 8) { __m256 ss; __m128i ssi0, ssi1, ssi2, ssi3; ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(0, 0); - MM256_LOAD(1, x+1); + MM256_LOAD(1, x + 1); MM256_SUM(1, 1); ssi0 = MM256_OUT(x); ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(1, 0); - MM256_LOAD(2, x+3); + MM256_LOAD(2, x + 3); MM256_SUM(2, 1); - ssi2 = MM256_OUT(x+2); + ssi2 = MM256_OUT(x + 2); ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_PERMUTE(0, 0, 1, 0x21); MM256_SUM(0, 0); MM256_PERMUTE(1, 1, 2, 0x21); MM256_SUM(1, 1); - ssi1 = MM256_OUT(x+1); + ssi1 = MM256_OUT(x + 1); ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(1, 0); MM256_PERMUTE(0, 2, 2, 0x21); MM256_SUM(0, 1); - ssi3 = MM256_OUT(x+3); + ssi3 = MM256_OUT(x + 3); ssi0 = _mm_packs_epi32(ssi0, ssi1); ssi2 = _mm_packs_epi32(ssi2, ssi3); ssi0 = _mm_packus_epi16(ssi0, ssi2); - _mm_storeu_si128((__m128i*) &out[x], ssi0); - + _mm_storeu_si128((__m128i *)&out[x], ssi0); + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(2, 0); - MM256_LOAD(1, x+5); + MM256_LOAD(1, x + 5); MM256_SUM(1, 1); - ssi0 = MM256_OUT(x+4); - + ssi0 = MM256_OUT(x + 4); + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(1, 0); - MM256_LOAD(0, x+7); + MM256_LOAD(0, x + 7); MM256_SUM(0, 1); - ssi2 = MM256_OUT(x+6); - + ssi2 = MM256_OUT(x + 6); + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_PERMUTE(2, 2, 1, 0x21); MM256_SUM(2, 0); MM256_PERMUTE(1, 1, 0, 0x21); MM256_SUM(1, 1); - ssi1 = MM256_OUT(x+5); - + ssi1 = MM256_OUT(x + 5); + ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(1, 0); MM256_PERMUTE(2, 0, 0, 0x21); MM256_SUM(2, 1); - ssi3 = MM256_OUT(x+7); + ssi3 = MM256_OUT(x + 7); ssi0 = _mm_packs_epi32(ssi0, ssi1); ssi2 = _mm_packs_epi32(ssi2, ssi3); ssi0 = _mm_packus_epi16(ssi0, ssi2); - _mm_storeu_si128((__m128i*) &out[x+4], ssi0); + _mm_storeu_si128((__m128i *)&out[x + 4], ssi0); } - for (; x < im->xsize-1-1; x += 2) { + for (; x < im->xsize - 1 - 1; x += 2) { __m256 ss; __m128i ssi0, ssi1; ss = _mm256_castps128_ps256(_mm_set1_ps(offset)); MM256_SUM(0, 0); - MM256_LOAD(1, x+1); + MM256_LOAD(1, x + 1); MM256_SUM(1, 1); ssi0 = MM256_OUT(x); @@ -144,23 +141,23 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, MM256_SUM(0, 0); MM256_PERMUTE(1, 0, 1, 0xf3); MM256_SUM(1, 1); - ssi1 = MM256_OUT(x+1); + 
ssi1 = MM256_OUT(x + 1); ssi0 = _mm_packs_epi32(ssi0, ssi1); ssi0 = _mm_packus_epi16(ssi0, ssi0); - _mm_storel_epi64((__m128i*) &out[x], ssi0); + _mm_storel_epi64((__m128i *)&out[x], ssi0); MM256_PERMUTE(0, 0, 1, 0x21); } - for (; x < im->xsize-1; x++) { + for (; x < im->xsize - 1; x++) { __m128 pix00, pix10, pix20; __m128 ss = _mm_set1_ps(offset); __m128i ssi0; - MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_LOAD(0, x - 1); MM_KERNEL1x3_SUM(0, 0); - MM_KERNEL1x3_LOAD(0, x+0); + MM_KERNEL1x3_LOAD(0, x + 0); MM_KERNEL1x3_SUM(0, 1); - MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_LOAD(0, x + 1); MM_KERNEL1x3_SUM(0, 2); ssi0 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi0); @@ -183,44 +180,44 @@ ImagingFilter3x3f_4u8(Imaging imOut, Imaging im, const float* kernel, out[0] = in0[0]; x = 1; - for (; x < im->xsize-1-2; x += 3) { + for (; x < im->xsize - 1 - 2; x += 3) { __m128 ss; __m128i ssi0, ssi1, ssi2; ss = _mm_set1_ps(offset); MM_KERNEL1x3_SUM(0, 0); MM_KERNEL1x3_SUM(1, 1); - MM_KERNEL1x3_LOAD(2, x+1); + MM_KERNEL1x3_LOAD(2, x + 1); MM_KERNEL1x3_SUM(2, 2); ssi0 = _mm_cvtps_epi32(ss); ss = _mm_set1_ps(offset); MM_KERNEL1x3_SUM(1, 0); MM_KERNEL1x3_SUM(2, 1); - MM_KERNEL1x3_LOAD(0, x+2); + MM_KERNEL1x3_LOAD(0, x + 2); MM_KERNEL1x3_SUM(0, 2); ssi1 = _mm_cvtps_epi32(ss); ss = _mm_set1_ps(offset); MM_KERNEL1x3_SUM(2, 0); MM_KERNEL1x3_SUM(0, 1); - MM_KERNEL1x3_LOAD(1, x+3); + MM_KERNEL1x3_LOAD(1, x + 3); MM_KERNEL1x3_SUM(1, 2); ssi2 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi1); ssi1 = _mm_packs_epi32(ssi2, ssi2); ssi0 = _mm_packus_epi16(ssi0, ssi1); - _mm_storeu_si128((__m128i*) &out[x], ssi0); + _mm_storeu_si128((__m128i *)&out[x], ssi0); } - for (; x < im->xsize-1; x++) { + for (; x < im->xsize - 1; x++) { __m128 ss = _mm_set1_ps(offset); __m128i ssi0; - MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_LOAD(0, x - 1); MM_KERNEL1x3_SUM(0, 0); - MM_KERNEL1x3_LOAD(0, x+0); + MM_KERNEL1x3_LOAD(0, x + 0); MM_KERNEL1x3_SUM(0, 1); - MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_LOAD(0, x + 1); MM_KERNEL1x3_SUM(0, 2); ssi0 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi0); diff --git a/src/libImaging/FilterSIMD_3x3f_u8.c b/src/libImaging/FilterSIMD_3x3f_u8.c index 858366c1d..78e63658a 100644 --- a/src/libImaging/FilterSIMD_3x3f_u8.c +++ b/src/libImaging/FilterSIMD_3x3f_u8.c @@ -1,18 +1,16 @@ void -ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, - float offset) -{ -#define MM_KERNEL1x3_SUM1(ss, row, kernel) \ - ss = _mm_mul_ps(pix0##row, kernel0##kernel); \ +ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float *kernel, float offset) { +#define MM_KERNEL1x3_SUM1(ss, row, kernel) \ + ss = _mm_mul_ps(pix0##row, kernel0##kernel); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##kernel)); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##kernel)); -#define MM_KERNEL1x3_SUM1_2(ss, row, kernel) \ - ss = _mm_mul_ps(pix3##row, kernel0##kernel); \ +#define MM_KERNEL1x3_SUM1_2(ss, row, kernel) \ + ss = _mm_mul_ps(pix3##row, kernel0##kernel); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, kernel1##kernel)); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel2##kernel)); -#define MM_KERNEL1x3_LOAD(row, x) \ +#define MM_KERNEL1x3_LOAD(row, x) \ pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); @@ -28,63 +26,63 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, memcpy(imOut->image8[0], im->image8[0], im->linesize); y = 1; - for (; y < 
im->ysize-1-1; y += 2) { - UINT8* in_1 = im->image8[y-1]; - UINT8* in0 = im->image8[y]; - UINT8* in1 = im->image8[y+1]; - UINT8* in2 = im->image8[y+2]; - UINT8* out0 = imOut->image8[y]; - UINT8* out1 = imOut->image8[y+1]; + for (; y < im->ysize - 1 - 1; y += 2) { + UINT8 *in_1 = im->image8[y - 1]; + UINT8 *in0 = im->image8[y]; + UINT8 *in1 = im->image8[y + 1]; + UINT8 *in2 = im->image8[y + 2]; + UINT8 *out0 = imOut->image8[y]; + UINT8 *out1 = imOut->image8[y + 1]; out0[0] = in0[0]; out1[0] = in1[0]; x = 1; - for (; x < im->xsize-1-3; x += 4) { + for (; x < im->xsize - 1 - 3; x += 4) { __m128 ss0, ss1, ss2, ss3, ss4, ss5; __m128 pix00, pix10, pix20, pix30; __m128i ssi0; - MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_LOAD(0, x - 1); MM_KERNEL1x3_SUM1(ss0, 0, 0); MM_KERNEL1x3_SUM1(ss1, 0, 1); ss0 = _mm_hadd_ps(ss0, ss1); - pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x-1])); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x - 1])); MM_KERNEL1x3_SUM1_2(ss3, 0, 0); MM_KERNEL1x3_SUM1_2(ss4, 0, 1); ss3 = _mm_hadd_ps(ss3, ss4); - MM_KERNEL1x3_LOAD(0, x+1); + MM_KERNEL1x3_LOAD(0, x + 1); MM_KERNEL1x3_SUM1(ss1, 0, 0); MM_KERNEL1x3_SUM1(ss2, 0, 1); ss1 = _mm_hadd_ps(ss1, ss2); - pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x+1])); + pix30 = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x + 1])); MM_KERNEL1x3_SUM1_2(ss4, 0, 0); MM_KERNEL1x3_SUM1_2(ss5, 0, 1); ss4 = _mm_hadd_ps(ss4, ss5); - + ss0 = _mm_hadd_ps(ss0, ss1); ss0 = _mm_add_ps(ss0, mm_offset); ssi0 = _mm_cvtps_epi32(ss0); ssi0 = _mm_packs_epi32(ssi0, ssi0); ssi0 = _mm_packus_epi16(ssi0, ssi0); - *((UINT32*) &out0[x]) = _mm_cvtsi128_si32(ssi0); + *((UINT32 *)&out0[x]) = _mm_cvtsi128_si32(ssi0); ss3 = _mm_hadd_ps(ss3, ss4); ss3 = _mm_add_ps(ss3, mm_offset); ssi0 = _mm_cvtps_epi32(ss3); ssi0 = _mm_packs_epi32(ssi0, ssi0); ssi0 = _mm_packus_epi16(ssi0, ssi0); - *((UINT32*) &out1[x]) = _mm_cvtsi128_si32(ssi0); + *((UINT32 *)&out1[x]) = _mm_cvtsi128_si32(ssi0); } - for (; x < im->xsize-1; x++) { + for (; x < im->xsize - 1; x++) { __m128 ss0, ss1; __m128 pix00, pix10, pix20, pix30; __m128i ssi0; - pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); - pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); - pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); - pix30 = _mm_set_ps(0, in2[x+1], in2[x], in2[x-1]); + pix00 = _mm_set_ps(0, in1[x + 1], in1[x], in1[x - 1]); + pix10 = _mm_set_ps(0, in0[x + 1], in0[x], in0[x - 1]); + pix20 = _mm_set_ps(0, in_1[x + 1], in_1[x], in_1[x - 1]); + pix30 = _mm_set_ps(0, in2[x + 1], in2[x], in2[x - 1]); MM_KERNEL1x3_SUM1(ss0, 0, 0); MM_KERNEL1x3_SUM1_2(ss1, 0, 0); @@ -107,20 +105,20 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, out0[x] = in0[x]; out1[x] = in1[x]; } - for (; y < im->ysize-1; y++) { - UINT8* in_1 = im->image8[y-1]; - UINT8* in0 = im->image8[y]; - UINT8* in1 = im->image8[y+1]; - UINT8* out = imOut->image8[y]; + for (; y < im->ysize - 1; y++) { + UINT8 *in_1 = im->image8[y - 1]; + UINT8 *in0 = im->image8[y]; + UINT8 *in1 = im->image8[y + 1]; + UINT8 *out = imOut->image8[y]; out[0] = in0[0]; x = 1; - for (; x < im->xsize-2; x++) { + for (; x < im->xsize - 2; x++) { __m128 ss; __m128 pix00, pix10, pix20; __m128i ssi0; - MM_KERNEL1x3_LOAD(0, x-1); + MM_KERNEL1x3_LOAD(0, x - 1); MM_KERNEL1x3_SUM1(ss, 0, 0); ss = _mm_hadd_ps(ss, ss); @@ -131,14 +129,14 @@ ImagingFilter3x3f_u8(Imaging imOut, Imaging im, const float* kernel, ssi0 = _mm_packus_epi16(ssi0, ssi0); out[x] = _mm_cvtsi128_si32(ssi0); } - for (; x < im->xsize-1; x++) { + for (; x < im->xsize - 1; x++) { __m128 ss; __m128 pix00, pix10, 
pix20; __m128i ssi0; - pix00 = _mm_set_ps(0, in1[x+1], in1[x], in1[x-1]); - pix10 = _mm_set_ps(0, in0[x+1], in0[x], in0[x-1]); - pix20 = _mm_set_ps(0, in_1[x+1], in_1[x], in_1[x-1]); + pix00 = _mm_set_ps(0, in1[x + 1], in1[x], in1[x - 1]); + pix10 = _mm_set_ps(0, in0[x + 1], in0[x], in0[x - 1]); + pix20 = _mm_set_ps(0, in_1[x + 1], in_1[x], in_1[x - 1]); MM_KERNEL1x3_SUM1(ss, 0, 0); ss = _mm_hadd_ps(ss, ss); diff --git a/src/libImaging/FilterSIMD_3x3i_4u8.c b/src/libImaging/FilterSIMD_3x3i_4u8.c index 2a52d45eb..f4fe73007 100644 --- a/src/libImaging/FilterSIMD_3x3i_4u8.c +++ b/src/libImaging/FilterSIMD_3x3i_4u8.c @@ -1,53 +1,106 @@ void -ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, - INT32 offset) -{ +ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16 *kernel, INT32 offset) { int x, y; - offset += 1 << (PRECISION_BITS-1); + offset += 1 << (PRECISION_BITS - 1); memcpy(imOut->image32[0], im->image32[0], im->linesize); - for (y = 1; y < im->ysize-1; y++) { - INT32* in_1 = im->image32[y-1]; - INT32* in0 = im->image32[y]; - INT32* in1 = im->image32[y+1]; - INT32* out = imOut->image32[y]; + for (y = 1; y < im->ysize - 1; y++) { + INT32 *in_1 = im->image32[y - 1]; + INT32 *in0 = im->image32[y]; + INT32 *in1 = im->image32[y + 1]; + INT32 *out = imOut->image32[y]; #if defined(__AVX2__) - #define MM_KERNEL_LOAD(x) \ - source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ - source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ - pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ - pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ - source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ - source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ - pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ - pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ - source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ - source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ - pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ - pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); +#define MM_KERNEL_LOAD(x) \ + source = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in1[x]), shuffle) \ + ); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in0[x]), shuffle) \ + ); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); \ + source = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in_1[x]), shuffle) \ + ); \ + source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); \ + pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); \ + pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); - #define MM_KERNEL_SUM(ss, row, kctl) \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - pix0##row, _mm256_shuffle_epi32(kernel00, kctl))); \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - pix1##row, _mm256_shuffle_epi32(kernel10, kctl))); \ - 
ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - pix2##row, _mm256_shuffle_epi32(kernel20, kctl))); +#define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm256_add_epi32( \ + ss, _mm256_madd_epi16(pix0##row, _mm256_shuffle_epi32(kernel00, kctl)) \ + ); \ + ss = _mm256_add_epi32( \ + ss, _mm256_madd_epi16(pix1##row, _mm256_shuffle_epi32(kernel10, kctl)) \ + ); \ + ss = _mm256_add_epi32( \ + ss, _mm256_madd_epi16(pix2##row, _mm256_shuffle_epi32(kernel20, kctl)) \ + ); - __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i shuffle = + _mm_set_epi8(15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0); __m256i kernel00 = _mm256_set_epi16( - 0, 0, 0, 0, kernel[2], kernel[1], kernel[0], 0, - 0, 0, 0, 0, 0, kernel[2], kernel[1], kernel[0]); + 0, + 0, + 0, + 0, + kernel[2], + kernel[1], + kernel[0], + 0, + 0, + 0, + 0, + 0, + 0, + kernel[2], + kernel[1], + kernel[0] + ); __m256i kernel10 = _mm256_set_epi16( - 0, 0, 0, 0, kernel[5], kernel[4], kernel[3], 0, - 0, 0, 0, 0, 0, kernel[5], kernel[4], kernel[3]); + 0, + 0, + 0, + 0, + kernel[5], + kernel[4], + kernel[3], + 0, + 0, + 0, + 0, + 0, + 0, + kernel[5], + kernel[4], + kernel[3] + ); __m256i kernel20 = _mm256_set_epi16( - 0, 0, 0, 0, kernel[8], kernel[7], kernel[6], 0, - 0, 0, 0, 0, 0, kernel[8], kernel[7], kernel[6]); + 0, + 0, + 0, + 0, + kernel[8], + kernel[7], + kernel[6], + 0, + 0, + 0, + 0, + 0, + 0, + kernel[8], + kernel[7], + kernel[6] + ); __m256i pix00, pix10, pix20; __m256i pix01, pix11, pix21; __m256i source; @@ -59,7 +112,7 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, } // Last loaded pixel: x+3 + 3 (wide load) // Last x should be < im->xsize-6 - for (; x < im->xsize-1-5; x += 4) { + for (; x < im->xsize - 1 - 5; x += 4) { __m256i ss0, ss2; __m128i ssout; @@ -70,35 +123,47 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, ss2 = _mm256_set1_epi32(offset); MM_KERNEL_SUM(ss2, 1, 0x00); - MM_KERNEL_LOAD(x+3); + MM_KERNEL_LOAD(x + 3); MM_KERNEL_SUM(ss2, 0, 0x55); ss2 = _mm256_srai_epi32(ss2, PRECISION_BITS); - + ss0 = _mm256_packs_epi32(ss0, ss2); ssout = _mm_packus_epi16( - _mm256_extracti128_si256(ss0, 0), - _mm256_extracti128_si256(ss0, 1)); + _mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1) + ); ssout = _mm_shuffle_epi32(ssout, 0xd8); - _mm_storeu_si128((__m128i*) &out[x], ssout); + _mm_storeu_si128((__m128i *)&out[x], ssout); } - for (; x < im->xsize-1; x++) { + for (; x < im->xsize - 1; x++) { __m256i ss = _mm256_set1_epi32(offset); - source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( - _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in1[x+1]), 8), - _mm_loadl_epi64((__m128i*) &in1[x-1])), shuffle)); + source = _mm256_castsi128_si256(_mm_shuffle_epi8( + _mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32 *)&in1[x + 1]), 8), + _mm_loadl_epi64((__m128i *)&in1[x - 1]) + ), + shuffle + )); source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); pix00 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); pix01 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); - source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( - _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in0[x+1]), 8), - _mm_loadl_epi64((__m128i*) &in0[x-1])), shuffle)); + source = _mm256_castsi128_si256(_mm_shuffle_epi8( + _mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32 *)&in0[x + 1]), 8), + _mm_loadl_epi64((__m128i *)&in0[x - 1]) + ), + shuffle + )); source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); 
pix10 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); pix11 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); - source = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_or_si128( - _mm_slli_si128(_mm_cvtsi32_si128(*(INT32*) &in_1[x+1]), 8), - _mm_loadl_epi64((__m128i*) &in_1[x-1])), shuffle)); + source = _mm256_castsi128_si256(_mm_shuffle_epi8( + _mm_or_si128( + _mm_slli_si128(_mm_cvtsi32_si128(*(INT32 *)&in_1[x + 1]), 8), + _mm_loadl_epi64((__m128i *)&in_1[x - 1]) + ), + shuffle + )); source = _mm256_inserti128_si256(source, _mm256_castsi256_si128(source), 1); pix20 = _mm256_unpacklo_epi8(source, _mm256_setzero_si256()); pix21 = _mm256_unpackhi_epi8(source, _mm256_setzero_si256()); @@ -107,46 +172,50 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, MM_KERNEL_SUM(ss, 1, 0x55); ss = _mm256_srai_epi32(ss, PRECISION_BITS); - + ss = _mm256_packs_epi32(ss, ss); ss = _mm256_packus_epi16(ss, ss); out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss)); } out[x] = in0[x]; - #undef MM_KERNEL_LOAD - #undef MM_KERNEL_SUM +#undef MM_KERNEL_LOAD +#undef MM_KERNEL_SUM #else - #define MM_KERNEL_LOAD(x) \ - source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ - pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ - pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ - source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ - pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ - pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ - source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ - pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ - pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); +#define MM_KERNEL_LOAD(x) \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in1[x]), shuffle); \ + pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix01 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in0[x]), shuffle); \ + pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix11 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); \ + source = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in_1[x]), shuffle); \ + pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); \ + pix21 = _mm_unpackhi_epi8(source, _mm_setzero_si128()); - #define MM_KERNEL_SUM(ss, row, kctl) \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - pix0##row, _mm_shuffle_epi32(kernel00, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - pix1##row, _mm_shuffle_epi32(kernel10, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - pix2##row, _mm_shuffle_epi32(kernel20, kctl))); +#define MM_KERNEL_SUM(ss, row, kctl) \ + ss = _mm_add_epi32( \ + ss, _mm_madd_epi16(pix0##row, _mm_shuffle_epi32(kernel00, kctl)) \ + ); \ + ss = _mm_add_epi32( \ + ss, _mm_madd_epi16(pix1##row, _mm_shuffle_epi32(kernel10, kctl)) \ + ); \ + ss = _mm_add_epi32( \ + ss, _mm_madd_epi16(pix2##row, _mm_shuffle_epi32(kernel20, kctl)) \ + ); - __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i shuffle = + _mm_set_epi8(15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0); __m128i kernel00 = _mm_set_epi16( - kernel[2], kernel[1], kernel[0], 0, - 0, kernel[2], kernel[1], kernel[0]); + kernel[2], kernel[1], kernel[0], 0, 0, kernel[2], kernel[1], kernel[0] + ); __m128i kernel10 = _mm_set_epi16( - kernel[5], kernel[4], kernel[3], 0, - 0, kernel[5], kernel[4], kernel[3]); + kernel[5], kernel[4], kernel[3], 0, 0, kernel[5], kernel[4], kernel[3] + ); 
__m128i kernel20 = _mm_set_epi16( - kernel[8], kernel[7], kernel[6], 0, - 0, kernel[8], kernel[7], kernel[6]); + kernel[8], kernel[7], kernel[6], 0, 0, kernel[8], kernel[7], kernel[6] + ); __m128i pix00, pix10, pix20; __m128i pix01, pix11, pix21; __m128i source; @@ -158,7 +227,7 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, } // Last loaded pixel: x+3 + 3 (wide load) // Last x should be < im->xsize-6 - for (; x < im->xsize-1-5; x += 4) { + for (; x < im->xsize - 1 - 5; x += 4) { __m128i ss0 = _mm_set1_epi32(offset); __m128i ss1 = _mm_set1_epi32(offset); __m128i ss2 = _mm_set1_epi32(offset); @@ -175,46 +244,47 @@ ImagingFilter3x3i_4u8(Imaging imOut, Imaging im, const INT16* kernel, MM_KERNEL_SUM(ss2, 1, 0x00); MM_KERNEL_SUM(ss3, 1, 0xaa); - MM_KERNEL_LOAD(x+3); + MM_KERNEL_LOAD(x + 3); MM_KERNEL_SUM(ss2, 0, 0x55); MM_KERNEL_SUM(ss3, 0, 0xff); ss2 = _mm_srai_epi32(ss2, PRECISION_BITS); ss3 = _mm_srai_epi32(ss3, PRECISION_BITS); - + ss0 = _mm_packs_epi32(ss0, ss1); ss2 = _mm_packs_epi32(ss2, ss3); ss0 = _mm_packus_epi16(ss0, ss2); - _mm_storeu_si128((__m128i*) &out[x], ss0); + _mm_storeu_si128((__m128i *)&out[x], ss0); } - for (; x < im->xsize-1; x++) { + for (; x < im->xsize - 1; x++) { __m128i ss = _mm_set1_epi32(offset); - source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in1[x-1]), shuffle); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)&in1[x - 1]), shuffle); pix00 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); - source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in1[x+1]), shuffle); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int *)&in1[x + 1]), shuffle); pix01 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); - source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in0[x-1]), shuffle); + source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)&in0[x - 1]), shuffle); pix10 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); - source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in0[x+1]), shuffle); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int *)&in0[x + 1]), shuffle); pix11 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); - source = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i*) &in_1[x-1]), shuffle); + source = + _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)&in_1[x - 1]), shuffle); pix20 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); - source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int*) &in_1[x+1]), shuffle); + source = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(int *)&in_1[x + 1]), shuffle); pix21 = _mm_unpacklo_epi8(source, _mm_setzero_si128()); MM_KERNEL_SUM(ss, 0, 0x00); MM_KERNEL_SUM(ss, 1, 0x55); ss = _mm_srai_epi32(ss, PRECISION_BITS); - + ss = _mm_packs_epi32(ss, ss); ss = _mm_packus_epi16(ss, ss); out[x] = _mm_cvtsi128_si32(ss); } out[x] = in0[x]; - #undef MM_KERNEL_LOAD - #undef MM_KERNEL_SUM +#undef MM_KERNEL_LOAD +#undef MM_KERNEL_SUM #endif } diff --git a/src/libImaging/FilterSIMD_5x5f_4u8.c b/src/libImaging/FilterSIMD_5x5f_4u8.c index 097242adb..7ebd3e10f 100644 --- a/src/libImaging/FilterSIMD_5x5f_4u8.c +++ b/src/libImaging/FilterSIMD_5x5f_4u8.c @@ -1,17 +1,15 @@ void -ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, - float offset) -{ -#define MM_KERNEL1x5_LOAD(row, x) \ - pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ - pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ - pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ +ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float *kernel, float offset) { +#define MM_KERNEL1x5_LOAD(row, x) \ + pix0##row = 
_mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); -#define MM_KERNEL1x5_SUM(row, kindex) \ - ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ - ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[5 + kindex]))); \ +#define MM_KERNEL1x5_SUM(row, kindex) \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix0##row, _mm_set1_ps(kernel[0 + kindex]))); \ + ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, _mm_set1_ps(kernel[5 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, _mm_set1_ps(kernel[10 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, _mm_set1_ps(kernel[15 + kindex]))); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix4##row, _mm_set1_ps(kernel[20 + kindex]))); @@ -20,13 +18,13 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, memcpy(imOut->image32[0], im->image32[0], im->linesize); memcpy(imOut->image32[1], im->image32[1], im->linesize); - for (y = 2; y < im->ysize-2; y++) { - INT32* in_2 = im->image32[y-2]; - INT32* in_1 = im->image32[y-1]; - INT32* in0 = im->image32[y]; - INT32* in1 = im->image32[y+1]; - INT32* in2 = im->image32[y+2]; - INT32* out = imOut->image32[y]; + for (y = 2; y < im->ysize - 2; y++) { + INT32 *in_2 = im->image32[y - 2]; + INT32 *in_1 = im->image32[y - 1]; + INT32 *in0 = im->image32[y]; + INT32 *in1 = im->image32[y + 1]; + INT32 *in2 = im->image32[y + 2]; + INT32 *out = imOut->image32[y]; __m128 pix00, pix10, pix20, pix30, pix40; __m128 pix01, pix11, pix21, pix31, pix41; __m128 pix02, pix12, pix22, pix32, pix42; @@ -40,7 +38,7 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, out[0] = in0[0]; out[1] = in0[1]; x = 2; - for (; x < im->xsize-2-4; x += 5) { + for (; x < im->xsize - 2 - 4; x += 5) { __m128 ss; __m128i ssi0, ssi1, ssi2, ssi3; @@ -49,7 +47,7 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, MM_KERNEL1x5_SUM(1, 1); MM_KERNEL1x5_SUM(2, 2); MM_KERNEL1x5_SUM(3, 3); - MM_KERNEL1x5_LOAD(4, x+2); + MM_KERNEL1x5_LOAD(4, x + 2); MM_KERNEL1x5_SUM(4, 4); ssi0 = _mm_cvtps_epi32(ss); @@ -58,7 +56,7 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, MM_KERNEL1x5_SUM(2, 1); MM_KERNEL1x5_SUM(3, 2); MM_KERNEL1x5_SUM(4, 3); - MM_KERNEL1x5_LOAD(0, x+3); + MM_KERNEL1x5_LOAD(0, x + 3); MM_KERNEL1x5_SUM(0, 4); ssi1 = _mm_cvtps_epi32(ss); @@ -67,7 +65,7 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, MM_KERNEL1x5_SUM(3, 1); MM_KERNEL1x5_SUM(4, 2); MM_KERNEL1x5_SUM(0, 3); - MM_KERNEL1x5_LOAD(1, x+4); + MM_KERNEL1x5_LOAD(1, x + 4); MM_KERNEL1x5_SUM(1, 4); ssi2 = _mm_cvtps_epi32(ss); @@ -76,39 +74,39 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, MM_KERNEL1x5_SUM(4, 1); MM_KERNEL1x5_SUM(0, 2); MM_KERNEL1x5_SUM(1, 3); - MM_KERNEL1x5_LOAD(2, x+5); + MM_KERNEL1x5_LOAD(2, x + 5); MM_KERNEL1x5_SUM(2, 4); ssi3 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi1); ssi1 = _mm_packs_epi32(ssi2, ssi3); ssi0 = _mm_packus_epi16(ssi0, ssi1); - _mm_storeu_si128((__m128i*) &out[x], ssi0); + _mm_storeu_si128((__m128i *)&out[x], ssi0); ss = _mm_set1_ps(offset); MM_KERNEL1x5_SUM(4, 0); MM_KERNEL1x5_SUM(0, 1); MM_KERNEL1x5_SUM(1, 2); MM_KERNEL1x5_SUM(2, 3); - MM_KERNEL1x5_LOAD(3, x+6); + MM_KERNEL1x5_LOAD(3, x + 6); MM_KERNEL1x5_SUM(3, 4); ssi0 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi0); 
ssi0 = _mm_packus_epi16(ssi0, ssi0); - out[x+4] = _mm_cvtsi128_si32(ssi0); + out[x + 4] = _mm_cvtsi128_si32(ssi0); } - for (; x < im->xsize-2; x++) { + for (; x < im->xsize - 2; x++) { __m128 ss = _mm_set1_ps(offset); __m128i ssi0; - MM_KERNEL1x5_LOAD(0, x-2); + MM_KERNEL1x5_LOAD(0, x - 2); MM_KERNEL1x5_SUM(0, 0); - MM_KERNEL1x5_LOAD(0, x-1); + MM_KERNEL1x5_LOAD(0, x - 1); MM_KERNEL1x5_SUM(0, 1); - MM_KERNEL1x5_LOAD(0, x+0); + MM_KERNEL1x5_LOAD(0, x + 0); MM_KERNEL1x5_SUM(0, 2); - MM_KERNEL1x5_LOAD(0, x+1); + MM_KERNEL1x5_LOAD(0, x + 1); MM_KERNEL1x5_SUM(0, 3); - MM_KERNEL1x5_LOAD(0, x+2); + MM_KERNEL1x5_LOAD(0, x + 2); MM_KERNEL1x5_SUM(0, 4); ssi0 = _mm_cvtps_epi32(ss); ssi0 = _mm_packs_epi32(ssi0, ssi0); @@ -116,10 +114,10 @@ ImagingFilter5x5f_4u8(Imaging imOut, Imaging im, const float* kernel, out[x] = _mm_cvtsi128_si32(ssi0); } out[x] = in0[x]; - out[x+1] = in0[x+1]; + out[x + 1] = in0[x + 1]; } memcpy(imOut->image32[y], im->image32[y], im->linesize); - memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize); + memcpy(imOut->image32[y + 1], im->image32[y + 1], im->linesize); #undef MM_KERNEL1x5_LOAD #undef MM_KERNEL1x5_SUM diff --git a/src/libImaging/FilterSIMD_5x5f_u8.c b/src/libImaging/FilterSIMD_5x5f_u8.c index 714bbdf33..6681b45f6 100644 --- a/src/libImaging/FilterSIMD_5x5f_u8.c +++ b/src/libImaging/FilterSIMD_5x5f_u8.c @@ -1,16 +1,14 @@ void -ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel, - float offset) -{ -#define MM_KERNEL1x5_LOAD(row, x) \ - pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ - pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ - pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ +ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float *kernel, float offset) { +#define MM_KERNEL1x5_LOAD(row, x) \ + pix0##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in2[x])); \ + pix1##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in1[x])); \ + pix2##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in0[x])); \ pix3##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_1[x])); \ pix4##row = _mm_cvtepi32_ps(mm_cvtepu8_epi32(&in_2[x])); -#define MM_KERNEL1x5_SUM(ss, row, krow) \ - ss = _mm_mul_ps(pix0##row, kernel0##krow); \ +#define MM_KERNEL1x5_SUM(ss, row, krow) \ + ss = _mm_mul_ps(pix0##row, kernel0##krow); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix1##row, kernel1##krow)); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix2##row, kernel2##krow)); \ ss = _mm_add_ps(ss, _mm_mul_ps(pix3##row, kernel3##krow)); \ @@ -20,68 +18,82 @@ ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel, memcpy(imOut->image8[0], im->image8[0], im->linesize); memcpy(imOut->image8[1], im->image8[1], im->linesize); - for (y = 2; y < im->ysize-2; y++) { - UINT8* in_2 = im->image8[y-2]; - UINT8* in_1 = im->image8[y-1]; - UINT8* in0 = im->image8[y]; - UINT8* in1 = im->image8[y+1]; - UINT8* in2 = im->image8[y+2]; - UINT8* out = imOut->image8[y]; + for (y = 2; y < im->ysize - 2; y++) { + UINT8 *in_2 = im->image8[y - 2]; + UINT8 *in_1 = im->image8[y - 1]; + UINT8 *in0 = im->image8[y]; + UINT8 *in1 = im->image8[y + 1]; + UINT8 *in2 = im->image8[y + 2]; + UINT8 *out = imOut->image8[y]; __m128 kernelx0 = _mm_set_ps(kernel[15], kernel[10], kernel[5], kernel[0]); - __m128 kernel00 = _mm_loadu_ps(&kernel[0+1]); - __m128 kernel10 = _mm_loadu_ps(&kernel[5+1]); - __m128 kernel20 = _mm_loadu_ps(&kernel[10+1]); - __m128 kernel30 = _mm_loadu_ps(&kernel[15+1]); - __m128 kernel40 = _mm_loadu_ps(&kernel[20+1]); + __m128 kernel00 = _mm_loadu_ps(&kernel[0 + 1]); + __m128 kernel10 = _mm_loadu_ps(&kernel[5 + 1]); 
+ __m128 kernel20 = _mm_loadu_ps(&kernel[10 + 1]); + __m128 kernel30 = _mm_loadu_ps(&kernel[15 + 1]); + __m128 kernel40 = _mm_loadu_ps(&kernel[20 + 1]); __m128 kernelx1 = _mm_set_ps(kernel[19], kernel[14], kernel[9], kernel[4]); - __m128 kernel01 = _mm_loadu_ps(&kernel[0+0]); - __m128 kernel11 = _mm_loadu_ps(&kernel[5+0]); - __m128 kernel21 = _mm_loadu_ps(&kernel[10+0]); - __m128 kernel31 = _mm_loadu_ps(&kernel[15+0]); - __m128 kernel41 = _mm_loadu_ps(&kernel[20+0]); + __m128 kernel01 = _mm_loadu_ps(&kernel[0 + 0]); + __m128 kernel11 = _mm_loadu_ps(&kernel[5 + 0]); + __m128 kernel21 = _mm_loadu_ps(&kernel[10 + 0]); + __m128 kernel31 = _mm_loadu_ps(&kernel[15 + 0]); + __m128 kernel41 = _mm_loadu_ps(&kernel[20 + 0]); out[0] = in0[0]; out[1] = in0[1]; x = 2; - for (; x < im->xsize-2-1; x += 2) { + for (; x < im->xsize - 2 - 1; x += 2) { __m128 ss0, ss1; __m128i ssi0; __m128 pix00, pix10, pix20, pix30, pix40; - MM_KERNEL1x5_LOAD(0, x-1); + MM_KERNEL1x5_LOAD(0, x - 1); MM_KERNEL1x5_SUM(ss0, 0, 0); - ss0 = _mm_add_ps(ss0, _mm_mul_ps( - _mm_set_ps(in_1[x-2], in0[x-2], in1[x-2], in2[x-2]), - kernelx0)); + ss0 = _mm_add_ps( + ss0, + _mm_mul_ps( + _mm_set_ps(in_1[x - 2], in0[x - 2], in1[x - 2], in2[x - 2]), + kernelx0 + ) + ); MM_KERNEL1x5_SUM(ss1, 0, 1); - ss1 = _mm_add_ps(ss1, _mm_mul_ps( - _mm_set_ps(in_1[x+3], in0[x+3], in1[x+3], in2[x+3]), - kernelx1)); + ss1 = _mm_add_ps( + ss1, + _mm_mul_ps( + _mm_set_ps(in_1[x + 3], in0[x + 3], in1[x + 3], in2[x + 3]), + kernelx1 + ) + ); ss0 = _mm_hadd_ps(ss0, ss1); - ss0 = _mm_add_ps(ss0, _mm_set_ps( - offset, kernel[24] * in_2[x+3], - offset, kernel[20] * in_2[x-2])); + ss0 = _mm_add_ps( + ss0, + _mm_set_ps( + offset, kernel[24] * in_2[x + 3], offset, kernel[20] * in_2[x - 2] + ) + ); ss0 = _mm_hadd_ps(ss0, ss0); ssi0 = _mm_cvtps_epi32(ss0); ssi0 = _mm_packs_epi32(ssi0, ssi0); ssi0 = _mm_packus_epi16(ssi0, ssi0); - *((UINT32*) &out[x]) = _mm_cvtsi128_si32(ssi0); + *((UINT32 *)&out[x]) = _mm_cvtsi128_si32(ssi0); } - for (; x < im->xsize-2; x ++) { + for (; x < im->xsize - 2; x++) { __m128 ss0; __m128i ssi0; __m128 pix00, pix10, pix20, pix30, pix40; - MM_KERNEL1x5_LOAD(0, x-2); + MM_KERNEL1x5_LOAD(0, x - 2); MM_KERNEL1x5_SUM(ss0, 0, 1); - ss0 = _mm_add_ps(ss0, _mm_mul_ps( - _mm_set_ps(in_1[x+2], in0[x+2], in1[x+2], in2[x+2]), - kernelx1)); + ss0 = _mm_add_ps( + ss0, + _mm_mul_ps( + _mm_set_ps(in_1[x + 2], in0[x + 2], in1[x + 2], in2[x + 2]), + kernelx1 + ) + ); - ss0 = _mm_add_ps(ss0, _mm_set_ps( - 0, 0, offset, kernel[24] * in_2[x+2])); + ss0 = _mm_add_ps(ss0, _mm_set_ps(0, 0, offset, kernel[24] * in_2[x + 2])); ss0 = _mm_hadd_ps(ss0, ss0); ss0 = _mm_hadd_ps(ss0, ss0); ssi0 = _mm_cvtps_epi32(ss0); @@ -89,11 +101,11 @@ ImagingFilter5x5f_u8(Imaging imOut, Imaging im, const float* kernel, ssi0 = _mm_packus_epi16(ssi0, ssi0); out[x] = _mm_cvtsi128_si32(ssi0); } - out[x+0] = in0[x+0]; - out[x+1] = in0[x+1]; + out[x + 0] = in0[x + 0]; + out[x + 1] = in0[x + 1]; } memcpy(imOut->image8[y], im->image8[y], im->linesize); - memcpy(imOut->image8[y+1], im->image8[y+1], im->linesize); + memcpy(imOut->image8[y + 1], im->image8[y + 1], im->linesize); #undef MM_KERNEL1x5_LOAD #undef MM_KERNEL1x5_SUM diff --git a/src/libImaging/FilterSIMD_5x5i_4u8.c b/src/libImaging/FilterSIMD_5x5i_4u8.c index d91d472d0..92c118b29 100644 --- a/src/libImaging/FilterSIMD_5x5i_4u8.c +++ b/src/libImaging/FilterSIMD_5x5i_4u8.c @@ -1,62 +1,171 @@ void -ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, - INT32 offset) -{ +ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const 
INT16 *kernel, INT32 offset) { int x, y; - offset += 1 << (PRECISION_BITS-1); + offset += 1 << (PRECISION_BITS - 1); memcpy(imOut->image32[0], im->image32[0], im->linesize); memcpy(imOut->image32[1], im->image32[1], im->linesize); - for (y = 2; y < im->ysize-2; y++) { - INT32* in_2 = im->image32[y-2]; - INT32* in_1 = im->image32[y-1]; - INT32* in0 = im->image32[y]; - INT32* in1 = im->image32[y+1]; - INT32* in2 = im->image32[y+2]; - INT32* out = imOut->image32[y]; + for (y = 2; y < im->ysize - 2; y++) { + INT32 *in_2 = im->image32[y - 2]; + INT32 *in_1 = im->image32[y - 1]; + INT32 *in0 = im->image32[y]; + INT32 *in1 = im->image32[y + 1]; + INT32 *in2 = im->image32[y + 2]; + INT32 *out = imOut->image32[y]; #if defined(__AVX2__) - #define MM_KERNEL_LOAD(row, x) \ - pix0##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle)); \ - pix0##row = _mm256_inserti128_si256(pix0##row, _mm256_castsi256_si128(pix0##row), 1); \ - pix1##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle)); \ - pix1##row = _mm256_inserti128_si256(pix1##row, _mm256_castsi256_si128(pix1##row), 1); \ - pix2##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle)); \ - pix2##row = _mm256_inserti128_si256(pix2##row, _mm256_castsi256_si128(pix2##row), 1); \ - pix3##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle)); \ - pix3##row = _mm256_inserti128_si256(pix3##row, _mm256_castsi256_si128(pix3##row), 1); \ - pix4##row = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle)); \ - pix4##row = _mm256_inserti128_si256(pix4##row, _mm256_castsi256_si128(pix4##row), 1); +#define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in2[x]), shuffle) \ + ); \ + pix0##row = \ + _mm256_inserti128_si256(pix0##row, _mm256_castsi256_si128(pix0##row), 1); \ + pix1##row = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in1[x]), shuffle) \ + ); \ + pix1##row = \ + _mm256_inserti128_si256(pix1##row, _mm256_castsi256_si128(pix1##row), 1); \ + pix2##row = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in0[x]), shuffle) \ + ); \ + pix2##row = \ + _mm256_inserti128_si256(pix2##row, _mm256_castsi256_si128(pix2##row), 1); \ + pix3##row = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in_1[x]), shuffle) \ + ); \ + pix3##row = \ + _mm256_inserti128_si256(pix3##row, _mm256_castsi256_si128(pix3##row), 1); \ + pix4##row = _mm256_castsi128_si256( \ + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in_2[x]), shuffle) \ + ); \ + pix4##row = \ + _mm256_inserti128_si256(pix4##row, _mm256_castsi256_si128(pix4##row), 1); - #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - unpack_epi8(pix0##row, zero), _mm256_shuffle_epi32(kernel0##krow, kctl))); \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - unpack_epi8(pix1##row, zero), _mm256_shuffle_epi32(kernel1##krow, kctl))); \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - unpack_epi8(pix2##row, zero), _mm256_shuffle_epi32(kernel2##krow, kctl))); \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - unpack_epi8(pix3##row, zero), _mm256_shuffle_epi32(kernel3##krow, kctl))); \ - ss = _mm256_add_epi32(ss, _mm256_madd_epi16( \ - unpack_epi8(pix4##row, zero), _mm256_shuffle_epi32(kernel4##krow, kctl))); +#define MM_KERNEL_SUM(ss, row, 
unpack_epi8, krow, kctl) \ + ss = _mm256_add_epi32( \ + ss, \ + _mm256_madd_epi16( \ + unpack_epi8(pix0##row, zero), _mm256_shuffle_epi32(kernel0##krow, kctl) \ + ) \ + ); \ + ss = _mm256_add_epi32( \ + ss, \ + _mm256_madd_epi16( \ + unpack_epi8(pix1##row, zero), _mm256_shuffle_epi32(kernel1##krow, kctl) \ + ) \ + ); \ + ss = _mm256_add_epi32( \ + ss, \ + _mm256_madd_epi16( \ + unpack_epi8(pix2##row, zero), _mm256_shuffle_epi32(kernel2##krow, kctl) \ + ) \ + ); \ + ss = _mm256_add_epi32( \ + ss, \ + _mm256_madd_epi16( \ + unpack_epi8(pix3##row, zero), _mm256_shuffle_epi32(kernel3##krow, kctl) \ + ) \ + ); \ + ss = _mm256_add_epi32( \ + ss, \ + _mm256_madd_epi16( \ + unpack_epi8(pix4##row, zero), _mm256_shuffle_epi32(kernel4##krow, kctl) \ + ) \ + ); - __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i shuffle = + _mm_set_epi8(15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0); __m256i kernel00 = _mm256_set_epi16( - 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0, - 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]); + 0, + 0, + kernel[4], + kernel[3], + kernel[2], + kernel[1], + kernel[0], + 0, + 0, + 0, + 0, + kernel[4], + kernel[3], + kernel[2], + kernel[1], + kernel[0] + ); __m256i kernel10 = _mm256_set_epi16( - 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0, - 0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]); + 0, + 0, + kernel[9], + kernel[8], + kernel[7], + kernel[6], + kernel[5], + 0, + 0, + 0, + 0, + kernel[9], + kernel[8], + kernel[7], + kernel[6], + kernel[5] + ); __m256i kernel20 = _mm256_set_epi16( - 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0, - 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]); + 0, + 0, + kernel[14], + kernel[13], + kernel[12], + kernel[11], + kernel[10], + 0, + 0, + 0, + 0, + kernel[14], + kernel[13], + kernel[12], + kernel[11], + kernel[10] + ); __m256i kernel30 = _mm256_set_epi16( - 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0, - 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]); + 0, + 0, + kernel[19], + kernel[18], + kernel[17], + kernel[16], + kernel[15], + 0, + 0, + 0, + 0, + kernel[19], + kernel[18], + kernel[17], + kernel[16], + kernel[15] + ); __m256i kernel40 = _mm256_set_epi16( - 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0, - 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]); + 0, + 0, + kernel[24], + kernel[23], + kernel[22], + kernel[21], + kernel[20], + 0, + 0, + 0, + 0, + kernel[24], + kernel[23], + kernel[22], + kernel[21], + kernel[20] + ); __m256i zero = _mm256_setzero_si256(); __m256i pix00, pix10, pix20, pix30, pix40; __m256i pix01, pix11, pix21, pix31, pix41; @@ -67,7 +176,7 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, MM_KERNEL_LOAD(0, 0); - for (; x < im->xsize-2-3; x += 4) { + for (; x < im->xsize - 2 - 3; x += 4) { __m256i ss0, ss2; __m128i ssout; @@ -77,8 +186,8 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, MM_KERNEL_SUM(ss0, 0, _mm256_unpacklo_epi8, 0, 0x00); MM_KERNEL_SUM(ss0, 0, _mm256_unpackhi_epi8, 0, 0x55); MM_KERNEL_SUM(ss2, 0, _mm256_unpackhi_epi8, 0, 0x00); - - MM_KERNEL_LOAD(1, x+2); + + MM_KERNEL_LOAD(1, x + 2); MM_KERNEL_SUM(ss0, 1, _mm256_unpacklo_epi8, 0, 0xaa); ss0 = _mm256_srai_epi32(ss0, PRECISION_BITS); MM_KERNEL_SUM(ss2, 1, _mm256_unpacklo_epi8, 0, 0x55); @@ -93,24 +202,34 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* 
kernel, ss0 = _mm256_packs_epi32(ss0, ss2); ssout = _mm_packus_epi16( - _mm256_extracti128_si256(ss0, 0), - _mm256_extracti128_si256(ss0, 1)); + _mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1) + ); ssout = _mm_shuffle_epi32(ssout, 0xd8); - _mm_storeu_si128((__m128i*) &out[x], ssout); + _mm_storeu_si128((__m128i *)&out[x], ssout); } - for (; x < im->xsize-2; x++) { + for (; x < im->xsize - 2; x++) { __m256i ss0; - MM_KERNEL_LOAD(0, x-2); - pix01 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle)); + MM_KERNEL_LOAD(0, x - 2); + pix01 = _mm256_castsi128_si256( + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in2[x + 2]), shuffle) + ); pix01 = _mm256_inserti128_si256(pix01, _mm256_castsi256_si128(pix01), 1); - pix11 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle)); + pix11 = _mm256_castsi128_si256( + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in1[x + 2]), shuffle) + ); pix11 = _mm256_inserti128_si256(pix11, _mm256_castsi256_si128(pix11), 1); - pix21 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle)); + pix21 = _mm256_castsi128_si256( + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in0[x + 2]), shuffle) + ); pix21 = _mm256_inserti128_si256(pix21, _mm256_castsi256_si128(pix21), 1); - pix31 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle)); + pix31 = _mm256_castsi128_si256( + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in_1[x + 2]), shuffle) + ); pix31 = _mm256_inserti128_si256(pix31, _mm256_castsi256_si128(pix31), 1); - pix41 = _mm256_castsi128_si256(_mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle)); + pix41 = _mm256_castsi128_si256( + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in_2[x + 2]), shuffle) + ); pix41 = _mm256_inserti128_si256(pix41, _mm256_castsi256_si128(pix41), 1); ss0 = _mm256_set1_epi32(offset); @@ -124,52 +243,83 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, out[x] = _mm_cvtsi128_si32(_mm256_castsi256_si128(ss0)); } out[x] = in0[x]; - out[x+1] = in0[x+1]; - #undef MM_KERNEL_LOAD - #undef MM_KERNEL_SUM + out[x + 1] = in0[x + 1]; +#undef MM_KERNEL_LOAD +#undef MM_KERNEL_SUM #else - #define MM_KERNEL_LOAD(row, x) \ - pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in2[x]), shuffle); \ - pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in1[x]), shuffle); \ - pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in0[x]), shuffle); \ - pix3##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_1[x]), shuffle); \ - pix4##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*) &in_2[x]), shuffle); +#define MM_KERNEL_LOAD(row, x) \ + pix0##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in2[x]), shuffle); \ + pix1##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in1[x]), shuffle); \ + pix2##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in0[x]), shuffle); \ + pix3##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in_1[x]), shuffle); \ + pix4##row = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&in_2[x]), shuffle); - #define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - unpack_epi8(pix0##row, zero), _mm_shuffle_epi32(kernel0##krow, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - unpack_epi8(pix1##row, zero), _mm_shuffle_epi32(kernel1##krow, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - unpack_epi8(pix2##row, zero), _mm_shuffle_epi32(kernel2##krow, kctl))); \ - ss = 
_mm_add_epi32(ss, _mm_madd_epi16( \ - unpack_epi8(pix3##row, zero), _mm_shuffle_epi32(kernel3##krow, kctl))); \ - ss = _mm_add_epi32(ss, _mm_madd_epi16( \ - unpack_epi8(pix4##row, zero), _mm_shuffle_epi32(kernel4##krow, kctl))); +#define MM_KERNEL_SUM(ss, row, unpack_epi8, krow, kctl) \ + ss = _mm_add_epi32( \ + ss, \ + _mm_madd_epi16( \ + unpack_epi8(pix0##row, zero), _mm_shuffle_epi32(kernel0##krow, kctl) \ + ) \ + ); \ + ss = _mm_add_epi32( \ + ss, \ + _mm_madd_epi16( \ + unpack_epi8(pix1##row, zero), _mm_shuffle_epi32(kernel1##krow, kctl) \ + ) \ + ); \ + ss = _mm_add_epi32( \ + ss, \ + _mm_madd_epi16( \ + unpack_epi8(pix2##row, zero), _mm_shuffle_epi32(kernel2##krow, kctl) \ + ) \ + ); \ + ss = _mm_add_epi32( \ + ss, \ + _mm_madd_epi16( \ + unpack_epi8(pix3##row, zero), _mm_shuffle_epi32(kernel3##krow, kctl) \ + ) \ + ); \ + ss = _mm_add_epi32( \ + ss, \ + _mm_madd_epi16( \ + unpack_epi8(pix4##row, zero), _mm_shuffle_epi32(kernel4##krow, kctl) \ + ) \ + ); - __m128i shuffle = _mm_set_epi8(15,11,14,10,13,9,12,8, 7,3,6,2,5,1,4,0); + __m128i shuffle = + _mm_set_epi8(15, 11, 14, 10, 13, 9, 12, 8, 7, 3, 6, 2, 5, 1, 4, 0); __m128i kernel00 = _mm_set_epi16( - 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0]); + 0, 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0] + ); __m128i kernel10 = _mm_set_epi16( - 0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5]); + 0, 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5] + ); __m128i kernel20 = _mm_set_epi16( - 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10]); + 0, 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10] + ); __m128i kernel30 = _mm_set_epi16( - 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15]); + 0, 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15] + ); __m128i kernel40 = _mm_set_epi16( - 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20]); + 0, 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20] + ); __m128i kernel01 = _mm_set_epi16( - 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0); + 0, 0, kernel[4], kernel[3], kernel[2], kernel[1], kernel[0], 0 + ); __m128i kernel11 = _mm_set_epi16( - 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0); + 0, 0, kernel[9], kernel[8], kernel[7], kernel[6], kernel[5], 0 + ); __m128i kernel21 = _mm_set_epi16( - 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0); + 0, 0, kernel[14], kernel[13], kernel[12], kernel[11], kernel[10], 0 + ); __m128i kernel31 = _mm_set_epi16( - 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0); + 0, 0, kernel[19], kernel[18], kernel[17], kernel[16], kernel[15], 0 + ); __m128i kernel41 = _mm_set_epi16( - 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0); + 0, 0, kernel[24], kernel[23], kernel[22], kernel[21], kernel[20], 0 + ); __m128i zero = _mm_setzero_si128(); __m128i pix00, pix10, pix20, pix30, pix40; __m128i pix01, pix11, pix21, pix31, pix41; @@ -180,7 +330,7 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, MM_KERNEL_LOAD(0, 0); - for (; x < im->xsize-2-3; x += 4) { + for (; x < im->xsize - 2 - 3; x += 4) { __m128i ss0, ss1, ss2, ss3; ss0 = _mm_set1_epi32(offset); @@ -194,8 +344,8 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, MM_KERNEL_SUM(ss1, 0, _mm_unpackhi_epi8, 1, 0x55); MM_KERNEL_SUM(ss2, 0, _mm_unpackhi_epi8, 0, 0x00); MM_KERNEL_SUM(ss3, 0, _mm_unpackhi_epi8, 1, 0x00); - - MM_KERNEL_LOAD(1, x+2); + 
+ MM_KERNEL_LOAD(1, x + 2); MM_KERNEL_SUM(ss0, 1, _mm_unpacklo_epi8, 0, 0xaa); ss0 = _mm_srai_epi32(ss0, PRECISION_BITS); MM_KERNEL_SUM(ss1, 1, _mm_unpacklo_epi8, 1, 0xaa); @@ -216,17 +366,19 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, ss0 = _mm_packs_epi32(ss0, ss1); ss2 = _mm_packs_epi32(ss2, ss3); ss0 = _mm_packus_epi16(ss0, ss2); - _mm_storeu_si128((__m128i*) &out[x], ss0); + _mm_storeu_si128((__m128i *)&out[x], ss0); } - for (; x < im->xsize-2; x++) { + for (; x < im->xsize - 2; x++) { __m128i ss0; - MM_KERNEL_LOAD(0, x-2); - pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in2[x+2]), shuffle); - pix11 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in1[x+2]), shuffle); - pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in0[x+2]), shuffle); - pix31 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_1[x+2]), shuffle); - pix41 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32*) &in_2[x+2]), shuffle); + MM_KERNEL_LOAD(0, x - 2); + pix01 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in2[x + 2]), shuffle); + pix11 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in1[x + 2]), shuffle); + pix21 = _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in0[x + 2]), shuffle); + pix31 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in_1[x + 2]), shuffle); + pix41 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(*(INT32 *)&in_2[x + 2]), shuffle); ss0 = _mm_set1_epi32(offset); MM_KERNEL_SUM(ss0, 0, _mm_unpacklo_epi8, 0, 0x00); @@ -239,12 +391,12 @@ ImagingFilter5x5i_4u8(Imaging imOut, Imaging im, const INT16* kernel, out[x] = _mm_cvtsi128_si32(ss0); } out[x] = in0[x]; - out[x+1] = in0[x+1]; - #undef MM_KERNEL_LOAD - #undef MM_KERNEL_SUM + out[x + 1] = in0[x + 1]; +#undef MM_KERNEL_LOAD +#undef MM_KERNEL_SUM #endif } memcpy(imOut->image32[y], im->image32[y], im->linesize); - memcpy(imOut->image32[y+1], im->image32[y+1], im->linesize); + memcpy(imOut->image32[y + 1], im->image32[y + 1], im->linesize); }
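
Reviewer's note, not part of the patch: the fixed-point paths above are easier to audit against a scalar model. The sketch below is illustrative only; plain stdint types stand in for Pillow's UINT8/INT16/INT32, and `precision_bits` for PRECISION_BITS. It shows the per-sample arithmetic that the MM_KERNEL_SUM accumulation, _mm_srai_epi32, and the _mm_packs_epi32/_mm_packus_epi16 pair vectorize in ImagingFilter5x5i_4u8.

#include <stdint.h>

/* Scalar model of one output sample of the 5x5 fixed-point filter
   (illustrative sketch, not patch content). Assumptions: `offset`
   already includes the rounding term 1 << (precision_bits - 1), as
   added at the top of ImagingFilter5x5i_4u8, and rows[0] is the input
   line two *below* the output line (in2), mirroring the kernel/row
   pairing in the MM_KERNEL_LOAD / MM_KERNEL_SUM macros. */
static uint8_t
filter5x5i_sample(const uint8_t rows[5][5], const int16_t kernel[25],
                  int32_t offset, int precision_bits) {
    int32_t ss = offset;
    int ky, kx;
    for (ky = 0; ky < 5; ky++) {
        for (kx = 0; kx < 5; kx++) {
            /* _mm_madd_epi16: widen to 32 bits, multiply, accumulate. */
            ss += (int32_t)rows[ky][kx] * kernel[ky * 5 + kx];
        }
    }
    /* _mm_srai_epi32 is an arithmetic shift; this sketch assumes the
       usual arithmetic behavior of >> on signed values. */
    ss >>= precision_bits;
    /* _mm_packs_epi32 followed by _mm_packus_epi16 nets out to a
       saturating clamp into 0..255. */
    if (ss < 0) {
        ss = 0;
    }
    if (ss > 255) {
        ss = 255;
    }
    return (uint8_t)ss;
}

The _4u8 variants apply this same arithmetic to all four bands of a UINT32 pixel in parallel; the byte shuffles in the loads above rearrange the interleaved RGBA data so that _mm_madd_epi16 can pair kernel taps within each band.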