From cd2d40706114ccf8338f814976e4ad297c92658b Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius
Date: Tue, 16 Jan 2024 11:23:55 +0200
Subject: [PATCH] ImBuf: no need to use double precision inside IMB_transform

Double precision pixel coordinate interpolation was added in 3a65d2f5911
to fix an issue of a "wobbly" resulting image at very high scale factors.
But the root cause of that was the scanline loop repeatedly adding the
floating point step for each pixel, instead of multiplying the step by
the pixel index (repeated floating point additions can "drift" due to
imprecision, whereas a multiplication is much more accurate).

Change all that math back to use single precision floats. I checked the
original issue the commit was fixing; it is still fine. Also added a
gtest to cover this situation: imbuf_transform, nearest_very_large_scale

This makes IMB_transform a tiny bit faster; on Windows/VS2022/Ryzen 5950X,
scaling an image at 4K resolution:
- Bilinear: 5.92 -> 5.83 ms
- Subsampled3x3: 53.6 -> 52.7 ms
---
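A minimal standalone sketch (not part of the patch) of the precision issue
described above: repeatedly adding a single-precision step accumulates
rounding error, while multiplying the step by the pixel index keeps the
error down to roughly one rounding of the exact value. The numbers below
(a step of 3/3840 over 3840 pixels) simply mirror the new test and are
illustrative only.

#include <cstdio>

int main()
{
  const float step = 3.0f / 3840.0f; /* Source UV step per destination pixel. */

  /* What the old scanline loop effectively did: accumulate the step once per pixel. */
  float accumulated = 0.0f;
  for (int i = 0; i < 3840; i++) {
    accumulated += step;
  }

  /* What the patched loop does instead: multiply the step by the pixel index. */
  const float multiplied = step * 3840;

  /* The exact value is 3.0; the accumulated result typically drifts
   * noticeably further from it than the multiplied one. */
  printf("accumulated: %.7f\n", accumulated);
  printf("multiplied:  %.7f\n", multiplied);
  return 0;
}
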
 source/blender/imbuf/intern/transform.cc      | 57 +++++++++----------
 source/blender/imbuf/intern/transform_test.cc | 34 +++++++++++
 2 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/source/blender/imbuf/intern/transform.cc b/source/blender/imbuf/intern/transform.cc
index 66a0f09a040..9eeca79a0ea 100644
--- a/source/blender/imbuf/intern/transform.cc
+++ b/source/blender/imbuf/intern/transform.cc
@@ -28,25 +28,25 @@ struct TransformUserData {
   /** \brief Destination image buffer to write to. */
   ImBuf *dst;
   /** \brief UV coordinates at the origin (0,0) in source image space. */
-  double2 start_uv;
+  float2 start_uv;
 
   /**
    * \brief delta UV coordinates along the source image buffer, when moving a single pixel in the X
    * axis of the dst image buffer.
    */
-  double2 add_x;
+  float2 add_x;
 
   /**
    * \brief delta UV coordinate along the source image buffer, when moving a single pixel in the Y
    * axes of the dst image buffer.
    */
-  double2 add_y;
+  float2 add_y;
 
   struct {
     /**
     * Contains per sub-sample a delta to be added to the uv of the source image buffer.
     */
-    Vector<double2> delta_uvs;
+    Vector<float2> delta_uvs;
   } subsampling;
 
   struct {
@@ -76,29 +76,29 @@ struct TransformUserData {
 private:
  void init_start_uv(const float4x4 &transform_matrix)
  {
-    start_uv = double2(transform_matrix.location().xy());
+    start_uv = transform_matrix.location().xy();
  }
 
  void init_add_x(const float4x4 &transform_matrix)
  {
-    add_x = double2(transform_matrix.x_axis());
+    add_x = transform_matrix.x_axis().xy();
  }
 
  void init_add_y(const float4x4 &transform_matrix)
  {
-    add_y = double2(transform_matrix.y_axis());
+    add_y = transform_matrix.y_axis().xy();
  }
 
  void init_subsampling(const int num_subsamples)
  {
-    double2 subsample_add_x = add_x / num_subsamples;
-    double2 subsample_add_y = add_y / num_subsamples;
-    double2 offset_x = -add_x * 0.5 + subsample_add_x * 0.5;
-    double2 offset_y = -add_y * 0.5 + subsample_add_y * 0.5;
+    float2 subsample_add_x = add_x / num_subsamples;
+    float2 subsample_add_y = add_y / num_subsamples;
+    float2 offset_x = -add_x * 0.5f + subsample_add_x * 0.5f;
+    float2 offset_y = -add_y * 0.5f + subsample_add_y * 0.5f;
 
    for (int y : IndexRange(0, num_subsamples)) {
      for (int x : IndexRange(0, num_subsamples)) {
-        double2 delta_uv = offset_x + offset_y;
+        float2 delta_uv = offset_x + offset_y;
        delta_uv += x * subsample_add_x;
        delta_uv += y * subsample_add_y;
        subsampling.delta_uvs.append(delta_uv);
@@ -150,7 +150,7 @@ struct CropSource {
   *
   * Uses user_data.src_crop to determine if the uv coordinate should be skipped.
   */
-  static bool should_discard(const TransformUserData &user_data, const double2 &uv)
+  static bool should_discard(const TransformUserData &user_data, const float2 &uv)
  {
    return uv.x < user_data.src_crop.xmin || uv.x >= user_data.src_crop.xmax ||
           uv.y < user_data.src_crop.ymin || uv.y >= user_data.src_crop.ymax;
@@ -166,7 +166,7 @@ struct NoDiscard {
   *
   * Will never discard any pixels.
   */
-  static bool should_discard(const TransformUserData & /*user_data*/, const double2 & /*uv*/)
+  static bool should_discard(const TransformUserData & /*user_data*/, const float2 & /*uv*/)
  {
    return false;
  }
@@ -301,10 +301,10 @@ class Sampler {
  static const int ChannelLen = NumChannels;
  using SampleType = Pixel<StorageType, NumChannels>;
 
-  void sample(const ImBuf *source, const double2 &uv, SampleType &r_sample)
+  void sample(const ImBuf *source, const float2 &uv, SampleType &r_sample)
  {
-    float u = float(uv.x);
-    float v = float(uv.y);
+    float u = uv.x;
+    float v = uv.y;
    if constexpr (UVWrapping) {
      u = wrap_uv(u, source->x);
      v = wrap_uv(v, source->y);
@@ -519,21 +519,19 @@ class ScanlineProcessor {
  void process_one_sample_per_pixel(const TransformUserData *user_data, int scanline)
  {
    /* Note: sample at pixel center for proper filtering. */
-    double pixel_x = user_data->destination_region.x_range.first() + 0.5;
-    double pixel_y = scanline + 0.5;
-    double2 uv = user_data->start_uv + user_data->add_x * pixel_x + user_data->add_y * pixel_y;
+    float pixel_x = 0.5f;
+    float pixel_y = scanline + 0.5f;
+    float2 uv0 = user_data->start_uv + user_data->add_x * pixel_x + user_data->add_y * pixel_y;
 
    output.init_pixel_pointer(user_data->dst,
                              int2(user_data->destination_region.x_range.first(), scanline));
    for (int xi : user_data->destination_region.x_range) {
-      UNUSED_VARS(xi);
+      float2 uv = uv0 + xi * user_data->add_x;
      if (!discarder.should_discard(*user_data, uv)) {
        typename Sampler::SampleType sample;
        sampler.sample(user_data->src, uv, sample);
        channel_converter.convert_and_store(sample, output);
      }
-
-      uv += user_data->add_x;
      output.increase_pixel_pointer();
    }
  }
@@ -541,20 +539,20 @@ class ScanlineProcessor {
  void process_with_subsampling(const TransformUserData *user_data, int scanline)
  {
    /* Note: sample at pixel center for proper filtering. */
-    double pixel_x = user_data->destination_region.x_range.first() + 0.5;
-    double pixel_y = scanline + 0.5;
-    double2 uv = user_data->start_uv + user_data->add_x * pixel_x + user_data->add_y * pixel_y;
+    float pixel_x = 0.5f;
+    float pixel_y = scanline + 0.5f;
+    float2 uv0 = user_data->start_uv + user_data->add_x * pixel_x + user_data->add_y * pixel_y;
 
    output.init_pixel_pointer(user_data->dst,
                              int2(user_data->destination_region.x_range.first(), scanline));
    for (int xi : user_data->destination_region.x_range) {
-      UNUSED_VARS(xi);
+      float2 uv = uv0 + xi * user_data->add_x;
      typename Sampler::SampleType sample;
      sample.clear();
      int num_subsamples_added = 0;
 
-      for (const double2 &delta_uv : user_data->subsampling.delta_uvs) {
-        const double2 subsample_uv = uv + delta_uv;
+      for (const float2 &delta_uv : user_data->subsampling.delta_uvs) {
+        const float2 subsample_uv = uv + delta_uv;
        if (!discarder.should_discard(*user_data, subsample_uv)) {
          typename Sampler::SampleType sub_sample;
          sampler.sample(user_data->src, subsample_uv, sub_sample);
@@ -568,7 +566,6 @@ class ScanlineProcessor {
                                 user_data->subsampling.delta_uvs.size();
        channel_converter.mix_and_store(sample, output, mix_weight);
      }
-      uv += user_data->add_x;
      output.increase_pixel_pointer();
    }
  }
diff --git a/source/blender/imbuf/intern/transform_test.cc b/source/blender/imbuf/intern/transform_test.cc
index d20c87732bd..f513cff1285 100644
--- a/source/blender/imbuf/intern/transform_test.cc
+++ b/source/blender/imbuf/intern/transform_test.cc
@@ -6,6 +6,7 @@
 
 #include "BLI_color.hh"
 #include "BLI_math_matrix.hh"
+#include "BLI_math_quaternion_types.hh"
 #include "IMB_imbuf.h"
 
 namespace blender::imbuf::tests {
@@ -109,4 +110,37 @@ TEST(imbuf_transform, bicubic_fractional_larger)
  IMB_freeImBuf(res);
 }
 
+TEST(imbuf_transform, nearest_very_large_scale)
+{
+  /* Create 511x1 black image, with three middle pixels being red/green/blue. */
+  ImBuf *src = IMB_allocImBuf(511, 1, 32, IB_rect);
+  ColorTheme4b col_r = ColorTheme4b(255, 0, 0, 255);
+  ColorTheme4b col_g = ColorTheme4b(0, 255, 0, 255);
+  ColorTheme4b col_b = ColorTheme4b(0, 0, 255, 255);
+  ColorTheme4b col_0 = ColorTheme4b(0, 0, 0, 0);
+  ColorTheme4b *src_col = reinterpret_cast<ColorTheme4b *>(src->byte_buffer.data);
+  src_col[254] = col_r;
+  src_col[255] = col_g;
+  src_col[256] = col_b;
+
+  /* Create 3841x1 image, and scale the input image so that the three middle
+   * pixels cover almost all of it, except the rightmost pixel. */
+  ImBuf *res = IMB_allocImBuf(3841, 1, 32, IB_rect);
+  float4x4 matrix = math::from_loc_rot_scale<float4x4>(
+      float3(254, 0, 0), math::Quaternion::identity(), float3(3.0f / 3840.0f, 1, 1));
+  IMB_transform(
+      src, res, IMB_TRANSFORM_MODE_REGULAR, IMB_FILTER_NEAREST, 1, matrix.ptr(), nullptr);
+
+  /* Check result: leftmost red, middle green, two rightmost pixels blue and black.
+   * If the transform code internally does not have enough precision while stepping
+   * through the scanline, the rightmost side will not come out correctly. */
+  const ColorTheme4b *got = reinterpret_cast<const ColorTheme4b *>(res->byte_buffer.data);
+  EXPECT_EQ(got[0], col_r);
+  EXPECT_EQ(got[res->x / 2], col_g);
+  EXPECT_EQ(got[res->x - 2], col_b);
+  EXPECT_EQ(got[res->x - 1], col_0);
+  IMB_freeImBuf(src);
+  IMB_freeImBuf(res);
+}
+
 }  // namespace blender::imbuf::tests
-- 
2.30.2