Merge pull request #156 from homm/master

Performance improvement of alpha_composite function
2025-10-25 05:01:26 +03:00 · 2013-03-24 12:17:05 -07:00 · 2013-03-24 12:17:05 -07:00 · 4554bcb4e4
commit 4554bcb4e4
parent 8244203366 0663e14444
1 changed files with 53 additions and 52 deletions
--- a/libImaging/AlphaComposite.c
+++ b/libImaging/AlphaComposite.c
@ -12,81 +12,82 @@
 #include "Imaging.h"


+typedef struct  /* One pixel as four UINT8 channels; image rows are cast to rgba8* so a pixel can be copied with one struct assignment. */
+{
+    UINT8 r;  /* red channel -- assumes in-memory order R,G,B,A matches "RGBA" image rows; confirm */
+    UINT8 g;  /* green channel */
+    UINT8 b;  /* blue channel */
+    UINT8 a;  /* alpha channel; a source alpha of 0 makes the compositor copy the destination pixel unchanged */
+} rgba8;
+
+
+
 Imaging
 ImagingAlphaComposite(Imaging imDst, Imaging imSrc)
 {
    Imaging imOut;
    int x, y;
-    float dstR, dstG, dstB, dstA;
-    float srcR, srcG, srcB, srcA;
-    float outR, outG, outB, outA;

    /* Check arguments */
    if (!imDst || !imSrc ||
-	strcmp(imDst->mode, "RGBA") ||
-	imDst->type != IMAGING_TYPE_UINT8 ||
-	imDst->bands != 4)
-	return ImagingError_ModeError();
+        strcmp(imDst->mode, "RGBA") ||
+        imDst->type != IMAGING_TYPE_UINT8 ||
+        imDst->bands != 4)
+        return ImagingError_ModeError();
+
    if (strcmp(imDst->mode, imSrc->mode) ||
-	imDst->type  != imSrc->type  ||
-	imDst->bands != imSrc->bands ||
-	imDst->xsize != imSrc->xsize ||
-	imDst->ysize != imSrc->ysize)
-	return ImagingError_Mismatch();
+        imDst->type  != imSrc->type  ||
+        imDst->bands != imSrc->bands ||
+        imDst->xsize != imSrc->xsize ||
+        imDst->ysize != imSrc->ysize)
+        return ImagingError_Mismatch();

    imOut = ImagingNew(imDst->mode, imDst->xsize, imDst->ysize);
    if (!imOut)
-	return NULL;
+        return NULL;

    ImagingCopyInfo(imOut, imDst);

    for (y = 0; y < imDst->ysize; y++) {

-	UINT8* dst = (UINT8*) imDst->image[y];
-	UINT8* src = (UINT8*) imSrc->image[y];
-	UINT8* out = (UINT8*) imOut->image[y];
+        rgba8* dst = (rgba8*) imDst->image[y];
+        rgba8* src = (rgba8*) imSrc->image[y];
+        rgba8* out = (rgba8*) imOut->image[y];

-	for (x = 0; x < imDst->linesize; x += 4) {
+        for (x = 0; x < imDst->xsize; x ++) {

-	    dstR = dst[x + 0] / 255.0;
-	    dstG = dst[x + 1] / 255.0;
-	    dstB = dst[x + 2] / 255.0;
-	    dstA = dst[x + 3] / 255.0;
+            if (src->a == 0) {
+                // Copy 4 bytes at once.
+                *out = *dst;
+            } else {
+                // Integer implementation with increased precision.
+                // Each variable has extra meaningful bits.
+                // Divisions are rounded.

-	    srcR = src[x + 0] / 255.0;
-	    srcG = src[x + 1] / 255.0;
-	    srcB = src[x + 2] / 255.0;
-	    srcA = src[x + 3] / 255.0;
+                // This code uses a trick from Paste.c:
+                // (a + (2 << (n-1)) - 1) / ((2 << n)-1)
+                // is almost equivalent to:
+                // tmp = a + (2 << (n-1)), ((tmp >> n) + tmp) >> n

-	    if (dstA == 1.0) {
-		outR = srcR * srcA + dstR * (1.0 - srcA);
-		outG = srcG * srcA + dstG * (1.0 - srcA);
-		outB = srcB * srcA + dstB * (1.0 - srcA);
-		outA = 1.0;
-	    } else if (srcA == 0.0) {
-		outR = dstR;
-		outG = dstG;
-		outB = dstB;
-		outA = dstA;
-	    } else {
-		outA = srcA + dstA * (1.0 - srcA);
-		if (outA == 0.0) {
-		    outR = 0.0;
-		    outG = 0.0;
-		    outB = 0.0;
-		} else {
-		    outR = (srcR * srcA + dstR * dstA * (1.0 - srcA)) / outA;
-		    outG = (srcG * srcA + dstG * dstA * (1.0 - srcA)) / outA;
-		    outB = (srcB * srcA + dstB * dstA * (1.0 - srcA)) / outA;
-		}
-	    }
+                // 0xff * 0xff = 16 meaningful bits.
+                UINT16 blend = dst->a * (255 - src->a);
+                // Shift 4 bits up so as not to lose blend precision
+                // on very transparent pixels.
+                UINT16 outa = (src->a << 4) + (((blend << 4) + (blend >> 4) + 0x80) >> 8);
+                UINT16 coef1 = (((src->a << 8) - src->a) << 8) / outa;  // 12
+                UINT16 coef2 = (blend << 8) / outa;  // 12

-	    out[x + 0] = (UINT8) (255.0 * outR + 0.5);
-	    out[x + 1] = (UINT8) (255.0 * outG + 0.5);
-	    out[x + 2] = (UINT8) (255.0 * outB + 0.5);
-	    out[x + 3] = (UINT8) (255.0 * outA + 0.5);
+                UINT32 tmpr = src->r * coef1 + dst->r * coef2 + 0x800;
+                out->r = ((tmpr >> 8) + tmpr) >> 12;
+                UINT32 tmpg = src->g * coef1 + dst->g * coef2 + 0x800;
+                out->g = ((tmpg >> 8) + tmpg) >> 12;
+                UINT32 tmpb = src->b * coef1 + dst->b * coef2 + 0x800;
+                out->b = ((tmpb >> 8) + tmpb) >> 12;
+                out->a = (outa + 0x7) >> 4;
+            }

-	}
+            dst++; src++; out++;
+        }

    }