Merge pull request #2679 from uploadcare/fast-filters

Fast filters
2025-08-10 07:14:46 +03:00 · 2017-09-10 17:22:00 +01:00 · 2017-09-10 17:22:00 +01:00 · 3b5c2c30be
commit 3b5c2c30be
parent e310b38ba6 98ee46827c
6 changed files with 316 additions and 112 deletions
--- a/PIL/Image.py
+++ b/PIL/Image.py
@ -1114,6 +1114,8 @@ class Image(object):
        :param filter: Filter kernel.
        :returns: An :py:class:`~PIL.Image.Image` object.  """

+        from . import ImageFilter
+
        self.load()

        if isinstance(filter, collections.Callable):
@ -1122,9 +1124,10 @@ class Image(object):
            raise TypeError("filter argument should be ImageFilter.Filter " +
                            "instance or class")

-        if self.im.bands == 1:
+        multiband = isinstance(filter, ImageFilter.MultibandFilter)
+        if self.im.bands == 1 or multiband:
            return self._new(filter.filter(self.im))
-        # fix to handle multiband images since _imaging doesn't
+
        ims = []
        for c in range(self.im.bands):
            ims.append(self._new(filter.filter(self.im.getband(c))))
--- a/PIL/ImageFilter.py
+++ b/PIL/ImageFilter.py
@ -22,7 +22,11 @@ class Filter(object):
    pass


-class Kernel(Filter):
+class MultibandFilter(Filter):
+    pass
+
+
+class Kernel(MultibandFilter):
    """
    Create a convolution kernel.  The current version only
    supports 3x3 and 5x5 integer and floating point kernels.
@ -142,7 +146,7 @@ class ModeFilter(Filter):
        return image.modefilter(self.size)


-class GaussianBlur(Filter):
+class GaussianBlur(MultibandFilter):
    """Gaussian blur filter.

    :param radius: Blur radius.
@ -156,7 +160,7 @@ class GaussianBlur(Filter):
        return image.gaussian_blur(self.radius)


-class UnsharpMask(Filter):
+class UnsharpMask(MultibandFilter):
    """Unsharp mask filter.

    See Wikipedia's entry on `digital unsharp masking`_ for an explanation of
--- a/Tests/test_image_filter.py
+++ b/Tests/test_image_filter.py
@ -95,26 +95,38 @@ class TestImageFilter(PillowTestCase):
        self.assertEqual(rankfilter.rank, 2)

    def test_consistency_3x3(self):
-        im = Image.open("Tests/images/hopper.bmp")
-        emboss = im.filter(ImageFilter.Kernel((3, 3),
-            (-1, -1,  0,
-             -1,  0,  1,
-              0,  1,  1), .3))
+        source = Image.open("Tests/images/hopper.bmp")
+        reference = Image.open("Tests/images/hopper_emboss.bmp")
+        kernel = ImageFilter.Kernel((3, 3),
+                                    (-1, -1,  0,
+                                     -1,  0,  1,
+                                      0,  1,  1), .3)
+        source = source.split() * 2
+        reference = reference.split() * 2

-        self.assert_image_equal(
-            emboss, Image.open("Tests/images/hopper_emboss.bmp"))
+        for mode in ['L', 'LA', 'RGB', 'CMYK']:
+            self.assert_image_equal(
+                Image.merge(mode, source[:len(mode)]).filter(kernel),
+                Image.merge(mode, reference[:len(mode)]),
+            )

    def test_consistency_5x5(self):
-        im = Image.open("Tests/images/hopper.bmp")
-        emboss = im.filter(ImageFilter.Kernel((5, 5),
-            (-1, -1, -1, -1,  0,
-             -1, -1, -1,  0,  1,
-             -1, -1,  0,  1,  1,
-             -1,  0,  1,  1,  1,
-              0,  1,  1,  1,  1), 0.3))
+        source = Image.open("Tests/images/hopper.bmp")
+        reference = Image.open("Tests/images/hopper_emboss_more.bmp")
+        kernel = ImageFilter.Kernel((5, 5),
+                                    (-1, -1, -1, -1,  0,
+                                     -1, -1, -1,  0,  1,
+                                     -1, -1,  0,  1,  1,
+                                     -1,  0,  1,  1,  1,
+                                      0,  1,  1,  1,  1), 0.3)
+        source = source.split() * 2
+        reference = reference.split() * 2

-        self.assert_image_equal(
-            emboss, Image.open("Tests/images/hopper_emboss_more.bmp"))
+        for mode in ['L', 'LA', 'RGB', 'CMYK']:
+            self.assert_image_equal(
+                Image.merge(mode, source[:len(mode)]).filter(kernel),
+                Image.merge(mode, reference[:len(mode)]),
+            )


 if __name__ == '__main__':
--- a/_imaging.c
+++ b/_imaging.c
@ -819,7 +819,7 @@ _filter(ImagingObject* self, PyObject* args)
    Py_ssize_t kernelsize;
    FLOAT32* kerneldata;

-    int xsize, ysize;
+    int xsize, ysize, i;
    float divisor, offset;
    PyObject* kernel = NULL;
    if (!PyArg_ParseTuple(args, "(ii)ffO", &xsize, &ysize,
@ -835,8 +835,12 @@ _filter(ImagingObject* self, PyObject* args)
        return ImagingError_ValueError("bad kernel size");
    }

+    for (i = 0; i < kernelsize; ++i) {
+        kerneldata[i] /= divisor;
+    }
+
    imOut = PyImagingNew(
-        ImagingFilter(self->image, xsize, ysize, kerneldata, offset, divisor)
+        ImagingFilter(self->image, xsize, ysize, kerneldata, offset)
        );

    free(kerneldata);
--- a/libImaging/Filter.c
+++ b/libImaging/Filter.c
@ -26,6 +26,23 @@

 #include "Imaging.h"

+
+#ifdef WORDS_BIGENDIAN
+    #define MAKE_UINT32(u0, u1, u2, u3) (u3 | (u2<<8) | (u1<<16) | (u0<<24))
+#else
+    #define MAKE_UINT32(u0, u1, u2, u3) (u0 | (u1<<8) | (u2<<16) | (u3<<24))
+#endif
+
+
+static inline UINT8 clip8(float in)
+{
+    if (in <= 0.0)
+        return 0;
+    if (in >= 255.0)
+        return 255;
+    return (UINT8) in;
+}
+
 Imaging
 ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode)
 {
@ -36,9 +53,8 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode)
    if (xmargin < 0 && ymargin < 0)
        return (Imaging) ImagingError_ValueError("bad kernel size");

-    imOut = ImagingNew(
-        imIn->mode, imIn->xsize+2*xmargin, imIn->ysize+2*ymargin
-        );
+    imOut = ImagingNewDirty(
+        imIn->mode, imIn->xsize+2*xmargin, imIn->ysize+2*ymargin);
    if (!imOut)
        return NULL;

@ -74,16 +90,259 @@ ImagingExpand(Imaging imIn, int xmargin, int ymargin, int mode)
    return imOut;
 }

+
+/* This is a workaround for a bug in GCC prior to 4.9 in 64-bit mode.
+   GCC generates code with a partial dependency, which is 3 times slower.
+   See: http://stackoverflow.com/a/26588074/253146 */
+#if defined(__x86_64__) && defined(__SSE__) &&  ! defined(__NO_INLINE__) && \
+    ! defined(__clang__) && defined(GCC_VERSION) && (GCC_VERSION < 40900)
+static float __attribute__((always_inline)) inline i2f(int v) {
+    float x;
+    __asm__("xorps %0, %0; cvtsi2ss %1, %0" : "=X"(x) : "r"(v) );
+    return x;
+}
+#else
+static float inline i2f(int v) { return (float) v; }
+#endif
+
+
+void
+ImagingFilter3x3(Imaging imOut, Imaging im, const float* kernel,
+                 float offset)
+{
+#define KERNEL1x3(in0, x, kernel, d) ( \
+    i2f((UINT8) in0[x-d])  * (kernel)[0] + \
+    i2f((UINT8) in0[x])    * (kernel)[1] + \
+    i2f((UINT8) in0[x+d])  * (kernel)[2])
+
+    int x = 0, y = 0;
+
+    memcpy(imOut->image[0], im->image[0], im->linesize);
+    if (im->bands == 1) {
+        // Add 0.5 to the offset once so the (UINT8) truncation in clip8 rounds to nearest
+        offset += 0.5;
+        for (y = 1; y < im->ysize-1; y++) {
+            UINT8* in_1 = (UINT8*) im->image[y-1];
+            UINT8* in0 = (UINT8*) im->image[y];
+            UINT8* in1 = (UINT8*) im->image[y+1];
+            UINT8* out = (UINT8*) imOut->image[y];
+
+            out[0] = in0[0];
+            for (x = 1; x < im->xsize-1; x++) {
+                float ss = offset;
+                ss += KERNEL1x3(in1, x, &kernel[0], 1);
+                ss += KERNEL1x3(in0, x, &kernel[3], 1);
+                ss += KERNEL1x3(in_1, x, &kernel[6], 1);
+                out[x] = clip8(ss);
+             }
+            out[x] = in0[x];
+        }
+    } else {
+        // Add 0.5 to the offset once so the (UINT8) truncation in clip8 rounds to nearest
+        offset += 0.5;
+        for (y = 1; y < im->ysize-1; y++) {
+            UINT8* in_1 = (UINT8*) im->image[y-1];
+            UINT8* in0 = (UINT8*) im->image[y];
+            UINT8* in1 = (UINT8*) im->image[y+1];
+            UINT32* out = (UINT32*) imOut->image[y];
+
+            out[0] = ((UINT32*) in0)[0];
+            if (im->bands == 2) {
+                for (x = 1; x < im->xsize-1; x++) {
+                    float ss0 = offset;
+                    float ss3 = offset;
+                    ss0 += KERNEL1x3(in1, x*4+0, &kernel[0], 4);
+                    ss3 += KERNEL1x3(in1, x*4+3, &kernel[0], 4);
+                    ss0 += KERNEL1x3(in0, x*4+0, &kernel[3], 4);
+                    ss3 += KERNEL1x3(in0, x*4+3, &kernel[3], 4);
+                    ss0 += KERNEL1x3(in_1, x*4+0, &kernel[6], 4);
+                    ss3 += KERNEL1x3(in_1, x*4+3, &kernel[6], 4);
+                    out[x] = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
+                }
+            } else if (im->bands == 3) {
+                for (x = 1; x < im->xsize-1; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    ss0 += KERNEL1x3(in1, x*4+0, &kernel[0], 4);
+                    ss1 += KERNEL1x3(in1, x*4+1, &kernel[0], 4);
+                    ss2 += KERNEL1x3(in1, x*4+2, &kernel[0], 4);
+                    ss0 += KERNEL1x3(in0, x*4+0, &kernel[3], 4);
+                    ss1 += KERNEL1x3(in0, x*4+1, &kernel[3], 4);
+                    ss2 += KERNEL1x3(in0, x*4+2, &kernel[3], 4);
+                    ss0 += KERNEL1x3(in_1, x*4+0, &kernel[6], 4);
+                    ss1 += KERNEL1x3(in_1, x*4+1, &kernel[6], 4);
+                    ss2 += KERNEL1x3(in_1, x*4+2, &kernel[6], 4);
+                    out[x] = MAKE_UINT32(
+                        clip8(ss0), clip8(ss1), clip8(ss2), 0);
+                }
+            } else if (im->bands == 4) {
+                for (x = 1; x < im->xsize-1; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    float ss3 = offset;
+                    ss0 += KERNEL1x3(in1, x*4+0, &kernel[0], 4);
+                    ss1 += KERNEL1x3(in1, x*4+1, &kernel[0], 4);
+                    ss2 += KERNEL1x3(in1, x*4+2, &kernel[0], 4);
+                    ss3 += KERNEL1x3(in1, x*4+3, &kernel[0], 4);
+                    ss0 += KERNEL1x3(in0, x*4+0, &kernel[3], 4);
+                    ss1 += KERNEL1x3(in0, x*4+1, &kernel[3], 4);
+                    ss2 += KERNEL1x3(in0, x*4+2, &kernel[3], 4);
+                    ss3 += KERNEL1x3(in0, x*4+3, &kernel[3], 4);
+                    ss0 += KERNEL1x3(in_1, x*4+0, &kernel[6], 4);
+                    ss1 += KERNEL1x3(in_1, x*4+1, &kernel[6], 4);
+                    ss2 += KERNEL1x3(in_1, x*4+2, &kernel[6], 4);
+                    ss3 += KERNEL1x3(in_1, x*4+3, &kernel[6], 4);
+                    out[x] = MAKE_UINT32(
+                        clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
+                }
+            }
+            out[x] = ((UINT32*) in0)[x];
+        }
+    }
+    memcpy(imOut->image[y], im->image[y], im->linesize);
+}
+
+
+void
+ImagingFilter5x5(Imaging imOut, Imaging im, const float* kernel,
+                 float offset)
+{
+#define KERNEL1x5(in0, x, kernel, d) ( \
+    i2f((UINT8) in0[x-d-d])   * (kernel)[0] + \
+    i2f((UINT8) in0[x-d])     * (kernel)[1] + \
+    i2f((UINT8) in0[x])       * (kernel)[2] + \
+    i2f((UINT8) in0[x+d])     * (kernel)[3] + \
+    i2f((UINT8) in0[x+d+d])   * (kernel)[4])
+
+    int x = 0, y = 0;
+
+    memcpy(imOut->image[0], im->image[0], im->linesize);
+    memcpy(imOut->image[1], im->image[1], im->linesize);
+    if (im->bands == 1) {
+        // Add 0.5 to the offset once so the (UINT8) truncation in clip8 rounds to nearest
+        offset += 0.5;
+        for (y = 2; y < im->ysize-2; y++) {
+            UINT8* in_2 = (UINT8*) im->image[y-2];
+            UINT8* in_1 = (UINT8*) im->image[y-1];
+            UINT8* in0 = (UINT8*) im->image[y];
+            UINT8* in1 = (UINT8*) im->image[y+1];
+            UINT8* in2 = (UINT8*) im->image[y+2];
+            UINT8* out = (UINT8*) imOut->image[y];
+
+            out[0] = in0[0];
+            out[1] = in0[1];
+            for (x = 2; x < im->xsize-2; x++) {
+                float ss = offset;
+                ss += KERNEL1x5(in2, x, &kernel[0], 1);
+                ss += KERNEL1x5(in1, x, &kernel[5], 1);
+                ss += KERNEL1x5(in0, x, &kernel[10], 1);
+                ss += KERNEL1x5(in_1, x, &kernel[15], 1);
+                ss += KERNEL1x5(in_2, x, &kernel[20], 1);
+                out[x] = clip8(ss);
+            }
+            out[x+0] = in0[x+0];
+            out[x+1] = in0[x+1];
+        }
+    } else {
+        // Add 0.5 to the offset once so the (UINT8) truncation in clip8 rounds to nearest
+        offset += 0.5;
+        for (y = 2; y < im->ysize-2; y++) {
+            UINT8* in_2 = (UINT8*) im->image[y-2];
+            UINT8* in_1 = (UINT8*) im->image[y-1];
+            UINT8* in0 = (UINT8*) im->image[y];
+            UINT8* in1 = (UINT8*) im->image[y+1];
+            UINT8* in2 = (UINT8*) im->image[y+2];
+            UINT32* out = (UINT32*) imOut->image[y];
+
+            out[0] = ((UINT32*) in0)[0];
+            out[1] = ((UINT32*) in0)[1];
+            if (im->bands == 2) {
+                for (x = 2; x < im->xsize-2; x++) {
+                    float ss0 = offset;
+                    float ss3 = offset;
+                    ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4);
+                    ss3 += KERNEL1x5(in2, x*4+3, &kernel[0], 4);
+                    ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4);
+                    ss3 += KERNEL1x5(in1, x*4+3, &kernel[5], 4);
+                    ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4);
+                    ss3 += KERNEL1x5(in0, x*4+3, &kernel[10], 4);
+                    ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4);
+                    ss3 += KERNEL1x5(in_1, x*4+3, &kernel[15], 4);
+                    ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4);
+                    ss3 += KERNEL1x5(in_2, x*4+3, &kernel[20], 4);
+                    out[x] = MAKE_UINT32(clip8(ss0), 0, 0, clip8(ss3));
+                }
+            } else if (im->bands == 3) {
+                for (x = 2; x < im->xsize-2; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4);
+                    ss1 += KERNEL1x5(in2, x*4+1, &kernel[0], 4);
+                    ss2 += KERNEL1x5(in2, x*4+2, &kernel[0], 4);
+                    ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4);
+                    ss1 += KERNEL1x5(in1, x*4+1, &kernel[5], 4);
+                    ss2 += KERNEL1x5(in1, x*4+2, &kernel[5], 4);
+                    ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4);
+                    ss1 += KERNEL1x5(in0, x*4+1, &kernel[10], 4);
+                    ss2 += KERNEL1x5(in0, x*4+2, &kernel[10], 4);
+                    ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4);
+                    ss1 += KERNEL1x5(in_1, x*4+1, &kernel[15], 4);
+                    ss2 += KERNEL1x5(in_1, x*4+2, &kernel[15], 4);
+                    ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4);
+                    ss1 += KERNEL1x5(in_2, x*4+1, &kernel[20], 4);
+                    ss2 += KERNEL1x5(in_2, x*4+2, &kernel[20], 4);
+                    out[x] = MAKE_UINT32(
+                        clip8(ss0), clip8(ss1), clip8(ss2), 0);
+                }
+            } else if (im->bands == 4) {
+                for (x = 2; x < im->xsize-2; x++) {
+                    float ss0 = offset;
+                    float ss1 = offset;
+                    float ss2 = offset;
+                    float ss3 = offset;
+                    ss0 += KERNEL1x5(in2, x*4+0, &kernel[0], 4);
+                    ss1 += KERNEL1x5(in2, x*4+1, &kernel[0], 4);
+                    ss2 += KERNEL1x5(in2, x*4+2, &kernel[0], 4);
+                    ss3 += KERNEL1x5(in2, x*4+3, &kernel[0], 4);
+                    ss0 += KERNEL1x5(in1, x*4+0, &kernel[5], 4);
+                    ss1 += KERNEL1x5(in1, x*4+1, &kernel[5], 4);
+                    ss2 += KERNEL1x5(in1, x*4+2, &kernel[5], 4);
+                    ss3 += KERNEL1x5(in1, x*4+3, &kernel[5], 4);
+                    ss0 += KERNEL1x5(in0, x*4+0, &kernel[10], 4);
+                    ss1 += KERNEL1x5(in0, x*4+1, &kernel[10], 4);
+                    ss2 += KERNEL1x5(in0, x*4+2, &kernel[10], 4);
+                    ss3 += KERNEL1x5(in0, x*4+3, &kernel[10], 4);
+                    ss0 += KERNEL1x5(in_1, x*4+0, &kernel[15], 4);
+                    ss1 += KERNEL1x5(in_1, x*4+1, &kernel[15], 4);
+                    ss2 += KERNEL1x5(in_1, x*4+2, &kernel[15], 4);
+                    ss3 += KERNEL1x5(in_1, x*4+3, &kernel[15], 4);
+                    ss0 += KERNEL1x5(in_2, x*4+0, &kernel[20], 4);
+                    ss1 += KERNEL1x5(in_2, x*4+1, &kernel[20], 4);
+                    ss2 += KERNEL1x5(in_2, x*4+2, &kernel[20], 4);
+                    ss3 += KERNEL1x5(in_2, x*4+3, &kernel[20], 4);
+                    out[x] = MAKE_UINT32(
+                        clip8(ss0), clip8(ss1), clip8(ss2), clip8(ss3));
+                }
+            }
+            out[x] = ((UINT32*) in0)[x];
+            out[x+1] = ((UINT32*) in0)[x+1];
+        }
+    }
+    memcpy(imOut->image[y], im->image[y], im->linesize);
+    memcpy(imOut->image[y+1], im->image[y+1], im->linesize);
+}
+
 Imaging
 ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32* kernel,
-              FLOAT32 offset, FLOAT32 divisor)
+              FLOAT32 offset)
 {
    Imaging imOut;
-    int x, y;
-    FLOAT32 sum;
    ImagingSectionCookie cookie;

-    if (!im || strcmp(im->mode, "L") != 0)
+    if ( ! im || im->type != IMAGING_TYPE_UINT8)
        return (Imaging) ImagingError_ModeError();

    if (im->xsize < xsize || im->ysize < ysize)
@ -92,95 +351,17 @@ ImagingFilter(Imaging im, int xsize, int ysize, const FLOAT32* kernel,
    if ((xsize != 3 && xsize != 5) || xsize != ysize)
        return (Imaging) ImagingError_ValueError("bad kernel size");

-    imOut = ImagingNew(im->mode, im->xsize, im->ysize);
+    imOut = ImagingNewDirty(im->mode, im->xsize, im->ysize);
    if (!imOut)
        return NULL;

-    // Add one time for rounding
-    offset += 0.5;
-
-    /* brute force kernel implementations */
-#define KERNEL3x3(image, kernel, d) ( \
-    (int) image[y+1][x-d] * kernel[0] + \
-    (int) image[y+1][x]   * kernel[1] + \
-    (int) image[y+1][x+d] * kernel[2] + \
-    (int) image[y][x-d]   * kernel[3] + \
-    (int) image[y][x]     * kernel[4] + \
-    (int) image[y][x+d]   * kernel[5] + \
-    (int) image[y-1][x-d] * kernel[6] + \
-    (int) image[y-1][x]   * kernel[7] + \
-    (int) image[y-1][x+d] * kernel[8])
-
-#define KERNEL5x5(image, kernel, d) ( \
-    (int) image[y+2][x-d-d] * kernel[0] + \
-    (int) image[y+2][x-d]   * kernel[1] + \
-    (int) image[y+2][x]     * kernel[2] + \
-    (int) image[y+2][x+d]   * kernel[3] + \
-    (int) image[y+2][x+d+d] * kernel[4] + \
-    (int) image[y+1][x-d-d] * kernel[5] + \
-    (int) image[y+1][x-d]   * kernel[6] + \
-    (int) image[y+1][x]     * kernel[7] + \
-    (int) image[y+1][x+d]   * kernel[8] + \
-    (int) image[y+1][x+d+d] * kernel[9] + \
-    (int) image[y][x-d-d]   * kernel[10] + \
-    (int) image[y][x-d]     * kernel[11] + \
-    (int) image[y][x]       * kernel[12] + \
-    (int) image[y][x+d]     * kernel[13] + \
-    (int) image[y][x+d+d]   * kernel[14] + \
-    (int) image[y-1][x-d-d] * kernel[15] + \
-    (int) image[y-1][x-d]   * kernel[16] + \
-    (int) image[y-1][x]     * kernel[17] + \
-    (int) image[y-1][x+d]   * kernel[18] + \
-    (int) image[y-1][x+d+d] * kernel[19] + \
-    (int) image[y-2][x-d-d] * kernel[20] + \
-    (int) image[y-2][x-d]   * kernel[21] + \
-    (int) image[y-2][x]     * kernel[22] + \
-    (int) image[y-2][x+d]   * kernel[23] + \
-    (int) image[y-2][x+d+d] * kernel[24])
-
    ImagingSectionEnter(&cookie);
    if (xsize == 3) {
        /* 3x3 kernel. */
-        for (x = 0; x < im->xsize; x++)
-            imOut->image[0][x] = im->image8[0][x];
-        for (y = 1; y < im->ysize-1; y++) {
-            imOut->image[y][0] = im->image8[y][0];
-            for (x = 1; x < im->xsize-1; x++) {
-                sum = KERNEL3x3(im->image8, kernel, 1) / divisor + offset;
-                if (sum <= 0)
-                    imOut->image8[y][x] = 0;
-                else if (sum >= 255)
-                    imOut->image8[y][x] = 255;
-                else
-                    imOut->image8[y][x] = (UINT8) sum;
-             }
-            imOut->image8[y][x] = im->image8[y][x];
-        }
-        for (x = 0; x < im->xsize; x++)
-            imOut->image8[y][x] = im->image8[y][x];
+        ImagingFilter3x3(imOut, im, kernel, offset);
    } else {
        /* 5x5 kernel. */
-        for (y = 0; y < 2; y++)
-            for (x = 0; x < im->xsize; x++)
-                imOut->image8[y][x] = im->image8[y][x];
-        for (; y < im->ysize-2; y++) {
-            for (x = 0; x < 2; x++)
-                imOut->image8[y][x] = im->image8[y][x];
-            for (; x < im->xsize-2; x++) {
-                sum = KERNEL5x5(im->image8, kernel, 1) / divisor + offset;
-                if (sum <= 0)
-                    imOut->image8[y][x] = 0;
-                else if (sum >= 255)
-                    imOut->image8[y][x] = 255;
-                else
-                    imOut->image8[y][x] = (UINT8) sum;
-            }
-            for (; x < im->xsize; x++)
-                imOut->image8[y][x] = im->image8[y][x];
-        }
-        for (; y < im->ysize; y++)
-            for (x = 0; x < im->xsize; x++)
-                imOut->image8[y][x] = im->image8[y][x];
+        ImagingFilter5x5(imOut, im, kernel, offset);
    }
    ImagingSectionLeave(&cookie);
    return imOut;
--- a/libImaging/Imaging.h
+++ b/libImaging/Imaging.h
@ -260,7 +260,7 @@ extern Imaging ImagingFillLinearGradient(const char* mode);
 extern Imaging ImagingFillRadialGradient(const char* mode);
 extern Imaging ImagingFilter(
    Imaging im, int xsize, int ysize, const FLOAT32* kernel,
-    FLOAT32 offset, FLOAT32 divisor);
+    FLOAT32 offset);
 extern Imaging ImagingFlipLeftRight(Imaging imOut, Imaging imIn);
 extern Imaging ImagingFlipTopBottom(Imaging imOut, Imaging imIn);
 extern Imaging ImagingGaussianBlur(Imaging imOut, Imaging imIn, float radius,