From 724f0c2c9e1d11fbd2a0c6d3121dfea3bf71147f Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Tue, 16 Jan 2024 22:02:57 +0200
Subject: [PATCH] ImBuf: speed up Subsampled3x3 image filter

Conceptually, the Subsampling filter is a box filter: it sums up N
source image pixels, computes their average and outputs the result.
The critical thing is that this must be done in premultiplied space,
so that colors from fully or mostly transparent regions do not
"override" opaque colors.

Previously, especially when operating on byte images, the code
achieved this by always working on byte values: it did "progressively
smaller" lerps into the byte color result, premultiplying and then
storing "straight" alpha again for each sample being processed. This
meant that each sample involved 3 divisions! It also led to some
precision loss, since for all 9 samples the intermediate results
were only stored at byte precision.

Reformulate that by simply accumulating the premultiplied color
as a float color. This gets rid of all divisions, except the last
step when said float needs to be written back into a byte color.

Time to process a 4K UHD destination image with the Subsampling 3x3
filter:
- Windows/VS2022/Ryzen5950X: 52.7ms -> 28.3ms
- Mac/clang15/M1Max: 54.4ms -> 43.7ms

The unit test results differ slightly, because the new output is more
accurate (as explained above, the previous code lost some precision).
---
 source/blender/imbuf/intern/transform.cc      | 105 ++++++++----------
 source/blender/imbuf/intern/transform_test.cc |   6 +-
 2 files changed, 50 insertions(+), 61 deletions(-)
diff --git a/source/blender/imbuf/intern/transform.cc b/source/blender/imbuf/intern/transform.cc
index f531f527df4..3629e23e911 100644
--- a/source/blender/imbuf/intern/transform.cc
+++ b/source/blender/imbuf/intern/transform.cc
@@ -134,26 +134,6 @@ static float wrap_uv(float value, int size)
   return x;
 }
 
-template<typename T, int NumChannels>
-static void add_subsample(const T *src, T *dst, int sample_number)
-{
-  BLI_STATIC_ASSERT((is_same_any_v<T, uchar, float>), "Only uchar and float channels supported.");
-
-  float factor = 1.0 / (sample_number + 1);
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(NumChannels == 4, "Pixels using uchar requires to have 4 channels.");
-    blend_color_interpolate_byte(dst, dst, src, factor);
-  }
-  else if constexpr (std::is_same_v<T, float> && NumChannels == 4) {
-    blend_color_interpolate_float(dst, dst, src, factor);
-  }
-  else if constexpr (std::is_same_v<T, float>) {
-    for (int i : IndexRange(NumChannels)) {
-      dst[i] = dst[i] * (1.0f - factor) + src[i] * factor;
-    }
-  }
-}
-
 template<int NumChannels>
 static void sample_nearest_float(const ImBuf *source, float u, float v, float *r_sample)
 {
@@ -235,39 +215,48 @@ static void sample_image(const ImBuf *source, float u, float v, T *r_sample)
   }
 }
 
-template<typename T, int SrcChannels> static void store_sample(const T *sample, T *dst)
+static void add_subsample(const float src[4], float dst[4])
 {
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
-    copy_v4_v4_uchar(dst, sample);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
-    copy_v4_v4(dst, sample);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 3) {
-    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 2) {
-    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
-  }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 1) {
-    /* Note: single channel sample is stored as grayscale. */
-    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
-  }
-  else {
-    BLI_assert_unreachable();
-  }
+  add_v4_v4(dst, src);
 }
 
-template<typename T, int SrcChannels>
-static void mix_and_store_sample(const T *sample, T *dst, const float mix_factor)
+static void add_subsample(const uchar src[4], float dst[4])
 {
-  if constexpr (std::is_same_v<T, uchar>) {
-    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
-    blend_color_interpolate_byte(dst, dst, sample, mix_factor);
+  float premul[4];
+  straight_uchar_to_premul_float(premul, src);
+  add_v4_v4(dst, premul);
+}
+
+static void store_premul_float_sample(const float sample[4], float dst[4])
+{
+  copy_v4_v4(dst, sample);
+}
+
+static void store_premul_float_sample(const float sample[4], uchar dst[4])
+{
+  premul_float_to_straight_uchar(dst, sample);
+}
+
+template<int SrcChannels> static void store_sample(const uchar *sample, uchar *dst)
+{
+  BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
+  copy_v4_v4_uchar(dst, sample);
+}
+
+template<int SrcChannels> static void store_sample(const float *sample, float *dst)
+{
+  if constexpr (SrcChannels == 4) {
+    copy_v4_v4(dst, sample);
   }
-  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
-    blend_color_interpolate_float(dst, dst, sample, mix_factor);
+  else if constexpr (SrcChannels == 3) {
+    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
+  }
+  else if constexpr (SrcChannels == 2) {
+    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
+  }
+  else if constexpr (SrcChannels == 1) {
+    /* Note: single channel sample is stored as grayscale. */
+    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
   }
   else {
     BLI_assert_unreachable();
@@ -286,29 +275,29 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
   float2 uv_start = ctx.start_uv + ctx.add_x * 0.5f + ctx.add_y * 0.5f;
 
   if (ctx.subsampling_deltas.size() > 1) {
-    /* Multiple samples per pixel. */
+    /* Multiple samples per pixel: accumulate them premultiplied,
+     * divide by sample count and write out (un-premultiplying if writing out
+     * to byte image). */
+    const float inv_count = 1.0f / ctx.subsampling_deltas.size();
     for (int yi : y_range) {
       T *output = init_pixel_pointer<T>(ctx.dst, ctx.dst_region_x_range.first(), yi);
       float2 uv_row = uv_start + yi * ctx.add_y;
       for (int xi : ctx.dst_region_x_range) {
         float2 uv = uv_row + xi * ctx.add_x;
-        T sample[4] = {};
-        int num_subsamples_added = 0;
+        float sample[4] = {};
 
         for (const float2 &delta_uv : ctx.subsampling_deltas) {
           const float2 sub_uv = uv + delta_uv;
           if (!CropSource || !should_discard(ctx, sub_uv)) {
             T sub_sample[4];
             sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, sub_uv.x, sub_uv.y, sub_sample);
-            add_subsample<T, SrcChannels>(sub_sample, sample, num_subsamples_added);
-            num_subsamples_added += 1;
+            add_subsample(sub_sample, sample);
           }
         }
 
-        if (num_subsamples_added != 0) {
-          const float mix_weight = float(num_subsamples_added) / ctx.subsampling_deltas.size();
-          mix_and_store_sample<T, SrcChannels>(sample, output, mix_weight);
-        }
+        mul_v4_v4fl(sample, sample, inv_count);
+        store_premul_float_sample(sample, output);
+
         output += 4;
       }
     }
@@ -323,7 +312,7 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
         if (!CropSource || !should_discard(ctx, uv)) {
           T sample[4];
           sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, uv.x, uv.y, sample);
-          store_sample<T, SrcChannels>(sample, output);
+          store_sample<SrcChannels>(sample, output);
         }
         output += 4;
       }
diff --git a/source/blender/imbuf/intern/transform_test.cc b/source/blender/imbuf/intern/transform_test.cc
index f513cff1285..816926b5d88 100644
--- a/source/blender/imbuf/intern/transform_test.cc
+++ b/source/blender/imbuf/intern/transform_test.cc
@@ -71,9 +71,9 @@ TEST(imbuf_transform, nearest_subsample3_2x_smaller)
 {
   ImBuf *res = transform_2x_smaller(IMB_FILTER_NEAREST, 3);
   const ColorTheme4b *got = reinterpret_cast<ColorTheme4b *>(res->byte_buffer.data);
-  EXPECT_EQ(got[0], ColorTheme4b(226, 168, 113, 255));
-  EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 16));
-  EXPECT_EQ(got[2], ColorTheme4b(55, 22, 64, 254));
+  EXPECT_EQ(got[0], ColorTheme4b(227, 170, 113, 255));
+  EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 17));
+  EXPECT_EQ(got[2], ColorTheme4b(56, 22, 64, 253));
   IMB_freeImBuf(res);
 }
 
-- 
2.30.2