From 4a6fdd3a255bae53a81208bf28340264f5ebd3db Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Tue, 16 Jan 2024 14:31:36 +0200
Subject: [PATCH] Cleanup: simplify code in ImBuf IMB_transform

There were some abstractions inside IMB_transform implementation that
were kinda getting in the way. I want to speed up Subsampled3x3
filtering by reformulating the math being done, but that needs to sample
byte images while storing intermediate result as a float -- but the
whole Pixel, PixelPointer et al. machinery was not quite prepared to
deal with that. Feels like those helper classes do not achieve much,
just make the code longer.

So remove them (Pixel, PixelPointer, ChannelConverter etc.) and use
simple functions where needed. Now the code is more straightforward and
270 lines shorter (40% of the whole file!), and allows me to do the
followup optimization easier.
---
 source/blender/imbuf/intern/transform.cc | 837 ++++++++---------------
 1 file changed, 282 insertions(+), 555 deletions(-)
diff --git a/source/blender/imbuf/intern/transform.cc b/source/blender/imbuf/intern/transform.cc
index 9eeca79a0ea..f531f527df4 100644
--- a/source/blender/imbuf/intern/transform.cc
+++ b/source/blender/imbuf/intern/transform.cc
@@ -6,7 +6,6 @@
  * \ingroup imbuf
  */
 
-#include <array>
 #include <type_traits>
 
 #include "BLI_math_color_blend.h"
@@ -22,73 +21,39 @@
 
 namespace blender::imbuf::transform {
 
-struct TransformUserData {
-  /** \brief Source image buffer to read from. */
+struct TransformContext {
   const ImBuf *src;
-  /** \brief Destination image buffer to write to. */
   ImBuf *dst;
-  /** \brief UV coordinates at the origin (0,0) in source image space. */
+  eIMBTransformMode mode;
+
+  /* UV coordinates at the destination origin (0,0) in source image space. */
   float2 start_uv;
 
-  /**
-   * \brief delta UV coordinates along the source image buffer, when moving a single pixel in the X
-   * axis of the dst image buffer.
-   */
+  /* Source UV step delta, when moving along one destination pixel in X axis. */
   float2 add_x;
 
-  /**
-   * \brief delta UV coordinate along the source image buffer, when moving a single pixel in the Y
-   * axes of the dst image buffer.
-   */
+  /* Source UV step delta, when moving along one destination pixel in Y axis. */
   float2 add_y;
 
-  struct {
-    /**
-     * Contains per sub-sample a delta to be added to the uv of the source image buffer.
-     */
-    Vector<float2, 9> delta_uvs;
-  } subsampling;
+  /* Per-subsample source image delta UVs. */
+  Vector<float2, 9> subsampling_deltas;
 
-  struct {
-    IndexRange x_range;
-    IndexRange y_range;
-  } destination_region;
+  IndexRange dst_region_x_range;
+  IndexRange dst_region_y_range;
 
-  /**
-   * \brief Cropping region in source image pixel space.
-   */
+  /* Cropping region in source image pixel space. */
   rctf src_crop;
 
-  /**
-   * \brief Initialize the start_uv, add_x and add_y fields based on the given transform matrix.
-   */
-  void init(const float4x4 &transform_matrix,
-            const int num_subsamples,
-            const bool do_crop_destination_region)
+  void init(const float4x4 &transform_matrix, const int num_subsamples, const bool has_source_crop)
   {
-    init_start_uv(transform_matrix);
-    init_add_x(transform_matrix);
-    init_add_y(transform_matrix);
+    start_uv = transform_matrix.location().xy();
+    add_x = transform_matrix.x_axis().xy();
+    add_y = transform_matrix.y_axis().xy();
     init_subsampling(num_subsamples);
-    init_destination_region(transform_matrix, do_crop_destination_region);
+    init_destination_region(transform_matrix, has_source_crop);
   }
 
  private:
-  void init_start_uv(const float4x4 &transform_matrix)
-  {
-    start_uv = transform_matrix.location().xy();
-  }
-
-  void init_add_x(const float4x4 &transform_matrix)
-  {
-    add_x = transform_matrix.x_axis().xy();
-  }
-
-  void init_add_y(const float4x4 &transform_matrix)
-  {
-    add_y = transform_matrix.y_axis().xy();
-  }
-
   void init_subsampling(const int num_subsamples)
   {
     float2 subsample_add_x = add_x / num_subsamples;
@@ -101,17 +66,16 @@ struct TransformUserData {
         float2 delta_uv = offset_x + offset_y;
         delta_uv += x * subsample_add_x;
         delta_uv += y * subsample_add_y;
-        subsampling.delta_uvs.append(delta_uv);
+        subsampling_deltas.append(delta_uv);
       }
     }
   }
 
-  void init_destination_region(const float4x4 &transform_matrix,
-                               const bool do_crop_destination_region)
+  void init_destination_region(const float4x4 &transform_matrix, const bool has_source_crop)
   {
-    if (!do_crop_destination_region) {
-      destination_region.x_range = IndexRange(dst->x);
-      destination_region.y_range = IndexRange(dst->y);
+    if (!has_source_crop) {
+      dst_region_x_range = IndexRange(dst->x);
+      dst_region_y_range = IndexRange(dst->y);
       return;
     }
 
@@ -136,99 +100,28 @@ struct TransformUserData {
     rcti dest_rect;
     BLI_rcti_init(&dest_rect, 0, dst->x, 0, dst->y);
     BLI_rcti_isect(&rect, &dest_rect, &rect);
-    destination_region.x_range = IndexRange(rect.xmin, BLI_rcti_size_x(&rect));
-    destination_region.y_range = IndexRange(rect.ymin, BLI_rcti_size_y(&rect));
+    dst_region_x_range = IndexRange(rect.xmin, BLI_rcti_size_x(&rect));
+    dst_region_y_range = IndexRange(rect.ymin, BLI_rcti_size_y(&rect));
   }
 };
 
-/**
- * \brief Crop uv-coordinates that are outside the user data src_crop rect.
- */
-struct CropSource {
-  /**
-   * \brief Should the source pixel at the given uv coordinate be discarded.
-   *
-   * Uses user_data.src_crop to determine if the uv coordinate should be skipped.
-   */
-  static bool should_discard(const TransformUserData &user_data, const float2 &uv)
-  {
-    return uv.x < user_data.src_crop.xmin || uv.x >= user_data.src_crop.xmax ||
-           uv.y < user_data.src_crop.ymin || uv.y >= user_data.src_crop.ymax;
-  }
-};
+/* Crop uv-coordinates that are outside the user data src_crop rect. */
+static bool should_discard(const TransformContext &ctx, const float2 &uv)
+{
+  return uv.x < ctx.src_crop.xmin || uv.x >= ctx.src_crop.xmax || uv.y < ctx.src_crop.ymin ||
+         uv.y >= ctx.src_crop.ymax;
+}
 
-/**
- * \brief Discard that does not discard anything.
- */
-struct NoDiscard {
-  /**
-   * \brief Should the source pixel at the given uv coordinate be discarded.
-   *
-   * Will never discard any pixels.
-   */
-  static bool should_discard(const TransformUserData & /*user_data*/, const float2 & /*uv*/)
-  {
-    return false;
-  }
-};
+template<typename T> static T *init_pixel_pointer(const ImBuf *image, int x, int y);
+template<> uchar *init_pixel_pointer(const ImBuf *image, int x, int y)
+{
+  return image->byte_buffer.data + (size_t(y) * image->x + x) * image->channels;
+}
+template<> float *init_pixel_pointer(const ImBuf *image, int x, int y)
+{
+  return image->float_buffer.data + (size_t(y) * image->x + x) * image->channels;
+}
 
-/**
- * \brief Pointer to a pixel to write to in serial.
- */
-template<
-    /**
-     * \brief Kind of buffer.
-     * Possible options: float, uchar.
-     */
-    typename StorageType = float,
-
-    /**
-     * \brief Number of channels of a single pixel.
-     */
-    int NumChannels = 4>
-class PixelPointer {
- public:
-  static const int ChannelLen = NumChannels;
-
- private:
-  StorageType *pointer;
-
- public:
-  void init_pixel_pointer(const ImBuf *image_buffer, int2 start_coordinate)
-  {
-    const size_t offset = (start_coordinate.y * size_t(image_buffer->x) + start_coordinate.x) *
-                          NumChannels;
-
-    if constexpr (std::is_same_v<StorageType, float>) {
-      pointer = image_buffer->float_buffer.data + offset;
-    }
-    else if constexpr (std::is_same_v<StorageType, uchar>) {
-      pointer = const_cast<uchar *>(
-          static_cast<const uchar *>(static_cast<const void *>(image_buffer->byte_buffer.data)) +
-          offset);
-    }
-    else {
-      pointer = nullptr;
-    }
-  }
-
-  /**
-   * \brief Get pointer to the current pixel to write to.
-   */
-  StorageType *get_pointer()
-  {
-    return pointer;
-  }
-
-  void increase_pixel_pointer()
-  {
-    pointer += NumChannels;
-  }
-};
-
-/**
- * \brief Repeats UV coordinate.
- */
 static float wrap_uv(float value, int size)
 {
   int x = int(floorf(value));
@@ -241,416 +134,246 @@ static float wrap_uv(float value, int size)
   return x;
 }
 
-/* TODO: should we use math_vectors for this. */
-template<typename StorageType, int NumChannels>
-class Pixel : public std::array<StorageType, NumChannels> {
- public:
-  void clear()
-  {
-    for (int channel_index : IndexRange(NumChannels)) {
-      (*this)[channel_index] = 0;
+template<typename T, int NumChannels>
+static void add_subsample(const T *src, T *dst, int sample_number)
+{
+  BLI_STATIC_ASSERT((is_same_any_v<T, uchar, float>), "Only uchar and float channels supported.");
+
+  float factor = 1.0 / (sample_number + 1);
+  if constexpr (std::is_same_v<T, uchar>) {
+    BLI_STATIC_ASSERT(NumChannels == 4, "Pixels using uchar requires to have 4 channels.");
+    blend_color_interpolate_byte(dst, dst, src, factor);
+  }
+  else if constexpr (std::is_same_v<T, float> && NumChannels == 4) {
+    blend_color_interpolate_float(dst, dst, src, factor);
+  }
+  else if constexpr (std::is_same_v<T, float>) {
+    for (int i : IndexRange(NumChannels)) {
+      dst[i] = dst[i] * (1.0f - factor) + src[i] * factor;
     }
   }
+}
 
-  void add_subsample(const Pixel<StorageType, NumChannels> other, int sample_number)
-  {
-    BLI_STATIC_ASSERT((std::is_same_v<StorageType, uchar>) || (std::is_same_v<StorageType, float>),
-                      "Only uchar and float channels supported.");
+template<int NumChannels>
+static void sample_nearest_float(const ImBuf *source, float u, float v, float *r_sample)
+{
+  int x1 = int(u);
+  int y1 = int(v);
 
-    float factor = 1.0 / (sample_number + 1);
-    if constexpr (std::is_same_v<StorageType, uchar>) {
-      BLI_STATIC_ASSERT(NumChannels == 4, "Pixels using uchar requires to have 4 channels.");
-      blend_color_interpolate_byte(this->data(), this->data(), other.data(), factor);
-    }
-    else if constexpr (std::is_same_v<StorageType, float> && NumChannels == 4) {
-      blend_color_interpolate_float(this->data(), this->data(), other.data(), factor);
-    }
-    else if constexpr (std::is_same_v<StorageType, float>) {
-      for (int channel_index : IndexRange(NumChannels)) {
-        (*this)[channel_index] = (*this)[channel_index] * (1.0 - factor) +
-                                 other[channel_index] * factor;
-      }
-    }
-  }
-};
-
-/**
- * \brief Read a sample from an image buffer.
- *
- * A sampler can read from an image buffer.
- */
-template<
-    /** \brief Interpolation mode to use when sampling. */
-    eIMBInterpolationFilterMode Filter,
-
-    /** \brief storage type of a single pixel channel (uchar or float). */
-    typename StorageType,
-    /**
-     * \brief number of channels if the image to read.
-     *
-     * Must match the actual channels of the image buffer that is sampled.
-     */
-    int NumChannels,
-    /**
-     * \brief Should UVs wrap
-     */
-    bool UVWrapping>
-class Sampler {
- public:
-  using ChannelType = StorageType;
-  static const int ChannelLen = NumChannels;
-  using SampleType = Pixel<StorageType, NumChannels>;
-
-  void sample(const ImBuf *source, const float2 &uv, SampleType &r_sample)
-  {
-    float u = uv.x;
-    float v = uv.y;
-    if constexpr (UVWrapping) {
-      u = wrap_uv(u, source->x);
-      v = wrap_uv(v, source->y);
-    }
-    /* BLI_bilinear_interpolation functions use `floor(uv)` and `floor(uv)+1`
-     * texels. For proper mapping between pixel and texel spaces, need to
-     * subtract 0.5. Same for bicubic. */
-    if constexpr (Filter == IMB_FILTER_BILINEAR || Filter == IMB_FILTER_BICUBIC) {
-      u -= 0.5f;
-      v -= 0.5f;
-    }
-    if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<StorageType, float> &&
-                  NumChannels == 4)
-    {
-      bilinear_interpolation_color_fl(source, r_sample.data(), u, v);
-    }
-    else if constexpr (Filter == IMB_FILTER_NEAREST && std::is_same_v<StorageType, uchar> &&
-                       NumChannels == 4)
-    {
-      nearest_interpolation_color_char(source, r_sample.data(), nullptr, u, v);
-    }
-    else if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<StorageType, uchar> &&
-                       NumChannels == 4)
-    {
-      bilinear_interpolation_color_char(source, r_sample.data(), u, v);
-    }
-    else if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<StorageType, float>) {
-      if constexpr (UVWrapping) {
-        BLI_bilinear_interpolation_wrap_fl(source->float_buffer.data,
-                                           r_sample.data(),
-                                           source->x,
-                                           source->y,
-                                           NumChannels,
-                                           UNPACK2(uv),
-                                           true,
-                                           true);
-      }
-      else {
-        BLI_bilinear_interpolation_fl(
-            source->float_buffer.data, r_sample.data(), source->x, source->y, NumChannels, u, v);
-      }
-    }
-    else if constexpr (Filter == IMB_FILTER_NEAREST && std::is_same_v<StorageType, float>) {
-      sample_nearest_float(source, u, v, r_sample);
-    }
-    else if constexpr (Filter == IMB_FILTER_BICUBIC && std::is_same_v<StorageType, float>) {
-      BLI_bicubic_interpolation_fl(
-          source->float_buffer.data, r_sample.data(), source->x, source->y, NumChannels, u, v);
-    }
-    else if constexpr (Filter == IMB_FILTER_BICUBIC && std::is_same_v<StorageType, uchar> &&
-                       NumChannels == 4)
-    {
-      BLI_bicubic_interpolation_char(
-          source->byte_buffer.data, r_sample.data(), source->x, source->y, u, v);
-    }
-    else {
-      /* Unsupported sampler. */
-      BLI_assert_unreachable();
-    }
-  }
-
- private:
-  void sample_nearest_float(const ImBuf *source,
-                            const float u,
-                            const float v,
-                            SampleType &r_sample)
-  {
-    BLI_STATIC_ASSERT(std::is_same_v<StorageType, float>);
-
-    /* ImBuf in must have a valid rect or rect_float, assume this is already checked */
-    int x1 = int(u);
-    int y1 = int(v);
-
-    /* Break when sample outside image is requested. */
-    if (x1 < 0 || x1 >= source->x || y1 < 0 || y1 >= source->y) {
-      for (int i = 0; i < NumChannels; i++) {
-        r_sample[i] = 0.0f;
-      }
-      return;
-    }
-
-    const size_t offset = (size_t(source->x) * y1 + x1) * NumChannels;
-    const float *dataF = source->float_buffer.data + offset;
+  /* Break when sample outside image is requested. */
+  if (x1 < 0 || x1 >= source->x || y1 < 0 || y1 >= source->y) {
     for (int i = 0; i < NumChannels; i++) {
-      r_sample[i] = dataF[i];
-    }
-  }
-};
-
-/**
- * \brief Change the number of channels and store it.
- *
- * Template class to convert and store a sample in a PixelPointer.
- * It supports:
- * - 4 channel uchar -> 4 channel uchar.
- * - 4 channel float -> 4 channel float.
- * - 3 channel float -> 4 channel float.
- * - 2 channel float -> 4 channel float.
- * - 1 channel float -> 4 channel float.
- */
-template<typename StorageType, int SourceNumChannels, int DestinationNumChannels>
-class ChannelConverter {
- public:
-  using SampleType = Pixel<StorageType, SourceNumChannels>;
-  using PixelType = PixelPointer<StorageType, DestinationNumChannels>;
-
-  /**
-   * \brief Convert the number of channels of the given sample to match the pixel pointer and
-   * store it at the location the pixel_pointer points at.
-   */
-  void convert_and_store(const SampleType &sample, PixelType &pixel_pointer)
-  {
-    if constexpr (std::is_same_v<StorageType, uchar>) {
-      BLI_STATIC_ASSERT(SourceNumChannels == 4, "Unsigned chars always have 4 channels.");
-      BLI_STATIC_ASSERT(DestinationNumChannels == 4, "Unsigned chars always have 4 channels.");
-
-      copy_v4_v4_uchar(pixel_pointer.get_pointer(), sample.data());
-    }
-    else if constexpr (std::is_same_v<StorageType, float> && SourceNumChannels == 4 &&
-                       DestinationNumChannels == 4)
-    {
-      copy_v4_v4(pixel_pointer.get_pointer(), sample.data());
-    }
-    else if constexpr (std::is_same_v<StorageType, float> && SourceNumChannels == 3 &&
-                       DestinationNumChannels == 4)
-    {
-      copy_v4_fl4(pixel_pointer.get_pointer(), sample[0], sample[1], sample[2], 1.0f);
-    }
-    else if constexpr (std::is_same_v<StorageType, float> && SourceNumChannels == 2 &&
-                       DestinationNumChannels == 4)
-    {
-      copy_v4_fl4(pixel_pointer.get_pointer(), sample[0], sample[1], 0.0f, 1.0f);
-    }
-    else if constexpr (std::is_same_v<StorageType, float> && SourceNumChannels == 1 &&
-                       DestinationNumChannels == 4)
-    {
-      copy_v4_fl4(pixel_pointer.get_pointer(), sample[0], sample[0], sample[0], 1.0f);
-    }
-    else {
-      BLI_assert_unreachable();
+      r_sample[i] = 0.0f;
     }
+    return;
   }
 
-  void mix_and_store(const SampleType &sample, PixelType &pixel_pointer, const float mix_factor)
-  {
-    if constexpr (std::is_same_v<StorageType, uchar>) {
-      BLI_STATIC_ASSERT(SourceNumChannels == 4, "Unsigned chars always have 4 channels.");
-      BLI_STATIC_ASSERT(DestinationNumChannels == 4, "Unsigned chars always have 4 channels.");
-      blend_color_interpolate_byte(
-          pixel_pointer.get_pointer(), pixel_pointer.get_pointer(), sample.data(), mix_factor);
-    }
-    else if constexpr (std::is_same_v<StorageType, float> && SourceNumChannels == 4 &&
-                       DestinationNumChannels == 4)
-    {
-      blend_color_interpolate_float(
-          pixel_pointer.get_pointer(), pixel_pointer.get_pointer(), sample.data(), mix_factor);
-    }
-    else {
-      BLI_assert_unreachable();
-    }
+  size_t offset = (size_t(source->x) * y1 + x1) * NumChannels;
+  const float *dataF = source->float_buffer.data + offset;
+  for (int i = 0; i < NumChannels; i++) {
+    r_sample[i] = dataF[i];
   }
-};
-
-/**
- * \brief Processor for a scanline.
- */
-template<
-    /**
-     * \brief Discard functor that implements `should_discard`.
-     */
-    typename Discard,
-
-    /**
-     * \brief Color interpolation function to read from the source buffer.
-     */
-    typename Sampler,
-
-    /**
-     * \brief Kernel to store to the destination buffer.
-     * Should be an PixelPointer
-     */
-    typename OutputPixelPointer>
-class ScanlineProcessor {
-  Discard discarder;
-  OutputPixelPointer output;
-  Sampler sampler;
-
-  /**
-   * \brief Channels sizzling logic to convert between the input image buffer and the output
-   * image buffer.
-   */
-  ChannelConverter<typename Sampler::ChannelType,
-                   Sampler::ChannelLen,
-                   OutputPixelPointer::ChannelLen>
-      channel_converter;
-
- public:
-  /**
-   * \brief Inner loop of the transformations, processing a full scanline.
-   */
-  void process(const TransformUserData *user_data, int scanline)
-  {
-    if (user_data->subsampling.delta_uvs.size() > 1) {
-      process_with_subsampling(user_data, scanline);
-    }
-    else {
-      process_one_sample_per_pixel(user_data, scanline);
-    }
-  }
-
- private:
-  void process_one_sample_per_pixel(const TransformUserData *user_data, int scanline)
-  {
-    /* Note: sample at pixel center for proper filtering. */
-    float pixel_x = 0.5f;
-    float pixel_y = scanline + 0.5f;
-    float2 uv0 = user_data->start_uv + user_data->add_x * pixel_x + user_data->add_y * pixel_y;
-
-    output.init_pixel_pointer(user_data->dst,
-                              int2(user_data->destination_region.x_range.first(), scanline));
-    for (int xi : user_data->destination_region.x_range) {
-      float2 uv = uv0 + xi * user_data->add_x;
-      if (!discarder.should_discard(*user_data, uv)) {
-        typename Sampler::SampleType sample;
-        sampler.sample(user_data->src, uv, sample);
-        channel_converter.convert_and_store(sample, output);
-      }
-      output.increase_pixel_pointer();
-    }
-  }
-
-  void process_with_subsampling(const TransformUserData *user_data, int scanline)
-  {
-    /* Note: sample at pixel center for proper filtering. */
-    float pixel_x = 0.5f;
-    float pixel_y = scanline + 0.5f;
-    float2 uv0 = user_data->start_uv + user_data->add_x * pixel_x + user_data->add_y * pixel_y;
-
-    output.init_pixel_pointer(user_data->dst,
-                              int2(user_data->destination_region.x_range.first(), scanline));
-    for (int xi : user_data->destination_region.x_range) {
-      float2 uv = uv0 + xi * user_data->add_x;
-      typename Sampler::SampleType sample;
-      sample.clear();
-      int num_subsamples_added = 0;
-
-      for (const float2 &delta_uv : user_data->subsampling.delta_uvs) {
-        const float2 subsample_uv = uv + delta_uv;
-        if (!discarder.should_discard(*user_data, subsample_uv)) {
-          typename Sampler::SampleType sub_sample;
-          sampler.sample(user_data->src, subsample_uv, sub_sample);
-          sample.add_subsample(sub_sample, num_subsamples_added);
-          num_subsamples_added += 1;
-        }
-      }
-
-      if (num_subsamples_added != 0) {
-        const float mix_weight = float(num_subsamples_added) /
-                                 user_data->subsampling.delta_uvs.size();
-        channel_converter.mix_and_store(sample, output, mix_weight);
-      }
-      output.increase_pixel_pointer();
-    }
-  }
-};
-
-/**
- * \brief callback function for threaded transformation.
- */
-template<typename Processor> void transform_scanline_function(void *custom_data, int scanline)
-{
-  const TransformUserData *user_data = static_cast<const TransformUserData *>(custom_data);
-  Processor processor;
-  processor.process(user_data, scanline);
 }
 
+/* Read a pixel from an image buffer, with filtering/wrapping parameters. */
+template<eIMBInterpolationFilterMode Filter, typename T, int NumChannels, bool WrapUV>
+static void sample_image(const ImBuf *source, float u, float v, T *r_sample)
+{
+  if constexpr (WrapUV) {
+    u = wrap_uv(u, source->x);
+    v = wrap_uv(v, source->y);
+  }
+  /* BLI_bilinear_interpolation functions use `floor(uv)` and `floor(uv)+1`
+   * texels. For proper mapping between pixel and texel spaces, need to
+   * subtract 0.5. Same for bicubic. */
+  if constexpr (Filter == IMB_FILTER_BILINEAR || Filter == IMB_FILTER_BICUBIC) {
+    u -= 0.5f;
+    v -= 0.5f;
+  }
+  if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<T, float> && NumChannels == 4) {
+    bilinear_interpolation_color_fl(source, r_sample, u, v);
+  }
+  else if constexpr (Filter == IMB_FILTER_NEAREST && std::is_same_v<T, uchar> && NumChannels == 4)
+  {
+    nearest_interpolation_color_char(source, r_sample, nullptr, u, v);
+  }
+  else if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<T, uchar> && NumChannels == 4)
+  {
+    bilinear_interpolation_color_char(source, r_sample, u, v);
+  }
+  else if constexpr (Filter == IMB_FILTER_BILINEAR && std::is_same_v<T, float>) {
+    if constexpr (WrapUV) {
+      BLI_bilinear_interpolation_wrap_fl(source->float_buffer.data,
+                                         r_sample,
+                                         source->x,
+                                         source->y,
+                                         NumChannels,
+                                         u,
+                                         v,
+                                         true,
+                                         true);
+    }
+    else {
+      BLI_bilinear_interpolation_fl(
+          source->float_buffer.data, r_sample, source->x, source->y, NumChannels, u, v);
+    }
+  }
+  else if constexpr (Filter == IMB_FILTER_NEAREST && std::is_same_v<T, float>) {
+    sample_nearest_float<NumChannels>(source, u, v, r_sample);
+  }
+  else if constexpr (Filter == IMB_FILTER_BICUBIC && std::is_same_v<T, float>) {
+    BLI_bicubic_interpolation_fl(
+        source->float_buffer.data, r_sample, source->x, source->y, NumChannels, u, v);
+  }
+  else if constexpr (Filter == IMB_FILTER_BICUBIC && std::is_same_v<T, uchar> && NumChannels == 4)
+  {
+    BLI_bicubic_interpolation_char(source->byte_buffer.data, r_sample, source->x, source->y, u, v);
+  }
+  else {
+    /* Unsupported sampler. */
+    BLI_assert_unreachable();
+  }
+}
+
+template<typename T, int SrcChannels> static void store_sample(const T *sample, T *dst)
+{
+  if constexpr (std::is_same_v<T, uchar>) {
+    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
+    copy_v4_v4_uchar(dst, sample);
+  }
+  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
+    copy_v4_v4(dst, sample);
+  }
+  else if constexpr (std::is_same_v<T, float> && SrcChannels == 3) {
+    copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f);
+  }
+  else if constexpr (std::is_same_v<T, float> && SrcChannels == 2) {
+    copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f);
+  }
+  else if constexpr (std::is_same_v<T, float> && SrcChannels == 1) {
+    /* Note: single channel sample is stored as grayscale. */
+    copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f);
+  }
+  else {
+    BLI_assert_unreachable();
+  }
+}
+
+template<typename T, int SrcChannels>
+static void mix_and_store_sample(const T *sample, T *dst, const float mix_factor)
+{
+  if constexpr (std::is_same_v<T, uchar>) {
+    BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels.");
+    blend_color_interpolate_byte(dst, dst, sample, mix_factor);
+  }
+  else if constexpr (std::is_same_v<T, float> && SrcChannels == 4) {
+    blend_color_interpolate_float(dst, dst, sample, mix_factor);
+  }
+  else {
+    BLI_assert_unreachable();
+  }
+}
+
+/* Process a block of destination image scanlines. */
 template<eIMBInterpolationFilterMode Filter,
-         typename StorageType,
-         int SourceNumChannels,
-         int DestinationNumChannels>
-ScanlineThreadFunc get_scanline_function(const eIMBTransformMode mode)
-
+         typename T,
+         int SrcChannels,
+         bool CropSource,
+         bool WrapUV>
+static void process_scanlines(const TransformContext &ctx, IndexRange y_range)
 {
-  switch (mode) {
-    case IMB_TRANSFORM_MODE_REGULAR:
-      return transform_scanline_function<
-          ScanlineProcessor<NoDiscard,
-                            Sampler<Filter, StorageType, SourceNumChannels, false>,
-                            PixelPointer<StorageType, DestinationNumChannels>>>;
-    case IMB_TRANSFORM_MODE_CROP_SRC:
-      return transform_scanline_function<
-          ScanlineProcessor<CropSource,
-                            Sampler<Filter, StorageType, SourceNumChannels, false>,
-                            PixelPointer<StorageType, DestinationNumChannels>>>;
-    case IMB_TRANSFORM_MODE_WRAP_REPEAT:
-      return transform_scanline_function<
-          ScanlineProcessor<NoDiscard,
-                            Sampler<Filter, StorageType, SourceNumChannels, true>,
-                            PixelPointer<StorageType, DestinationNumChannels>>>;
-  }
+  /* Note: sample at pixel center for proper filtering. */
+  float2 uv_start = ctx.start_uv + ctx.add_x * 0.5f + ctx.add_y * 0.5f;
 
-  BLI_assert_unreachable();
-  return nullptr;
-}
+  if (ctx.subsampling_deltas.size() > 1) {
+    /* Multiple samples per pixel. */
+    for (int yi : y_range) {
+      T *output = init_pixel_pointer<T>(ctx.dst, ctx.dst_region_x_range.first(), yi);
+      float2 uv_row = uv_start + yi * ctx.add_y;
+      for (int xi : ctx.dst_region_x_range) {
+        float2 uv = uv_row + xi * ctx.add_x;
+        T sample[4] = {};
+        int num_subsamples_added = 0;
 
-template<eIMBInterpolationFilterMode Filter>
-ScanlineThreadFunc get_scanline_function(const TransformUserData *user_data,
-                                         const eIMBTransformMode mode)
-{
-  const ImBuf *src = user_data->src;
-  const ImBuf *dst = user_data->dst;
+        for (const float2 &delta_uv : ctx.subsampling_deltas) {
+          const float2 sub_uv = uv + delta_uv;
+          if (!CropSource || !should_discard(ctx, sub_uv)) {
+            T sub_sample[4];
+            sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, sub_uv.x, sub_uv.y, sub_sample);
+            add_subsample<T, SrcChannels>(sub_sample, sample, num_subsamples_added);
+            num_subsamples_added += 1;
+          }
+        }
 
-  if (src->channels == 4 && dst->channels == 4) {
-    return get_scanline_function<Filter, float, 4, 4>(mode);
-  }
-  if (src->channels == 3 && dst->channels == 4) {
-    return get_scanline_function<Filter, float, 3, 4>(mode);
-  }
-  if (src->channels == 2 && dst->channels == 4) {
-    return get_scanline_function<Filter, float, 2, 4>(mode);
-  }
-  if (src->channels == 1 && dst->channels == 4) {
-    return get_scanline_function<Filter, float, 1, 4>(mode);
-  }
-  return nullptr;
-}
-
-template<eIMBInterpolationFilterMode Filter>
-static void transform_threaded(TransformUserData *user_data, const eIMBTransformMode mode)
-{
-  ScanlineThreadFunc scanline_func = nullptr;
-
-  if (user_data->dst->float_buffer.data && user_data->src->float_buffer.data) {
-    scanline_func = get_scanline_function<Filter>(user_data, mode);
-  }
-  else if (user_data->dst->byte_buffer.data && user_data->src->byte_buffer.data) {
-    /* Number of channels is always 4 when using uchar buffers (sRGB + straight alpha). */
-    scanline_func = get_scanline_function<Filter, uchar, 4, 4>(mode);
-  }
-
-  if (scanline_func != nullptr) {
-    threading::parallel_for(user_data->destination_region.y_range, 8, [&](IndexRange range) {
-      for (int scanline : range) {
-        scanline_func(user_data, scanline);
+        if (num_subsamples_added != 0) {
+          const float mix_weight = float(num_subsamples_added) / ctx.subsampling_deltas.size();
+          mix_and_store_sample<T, SrcChannels>(sample, output, mix_weight);
+        }
+        output += 4;
       }
-    });
+    }
+  }
+  else {
+    /* One sample per pixel. */
+    for (int yi : y_range) {
+      T *output = init_pixel_pointer<T>(ctx.dst, ctx.dst_region_x_range.first(), yi);
+      float2 uv_row = uv_start + yi * ctx.add_y;
+      for (int xi : ctx.dst_region_x_range) {
+        float2 uv = uv_row + xi * ctx.add_x;
+        if (!CropSource || !should_discard(ctx, uv)) {
+          T sample[4];
+          sample_image<Filter, T, SrcChannels, WrapUV>(ctx.src, uv.x, uv.y, sample);
+          store_sample<T, SrcChannels>(sample, output);
+        }
+        output += 4;
+      }
+    }
+  }
+}
+
+template<eIMBInterpolationFilterMode Filter, typename T, int SrcChannels>
+static void transform_scanlines(const TransformContext &ctx, IndexRange y_range)
+{
+  switch (ctx.mode) {
+    case IMB_TRANSFORM_MODE_REGULAR:
+      process_scanlines<Filter, T, SrcChannels, false, false>(ctx, y_range);
+      break;
+    case IMB_TRANSFORM_MODE_CROP_SRC:
+      process_scanlines<Filter, T, SrcChannels, true, false>(ctx, y_range);
+      break;
+    case IMB_TRANSFORM_MODE_WRAP_REPEAT:
+      process_scanlines<Filter, T, SrcChannels, false, true>(ctx, y_range);
+      break;
+    default:
+      BLI_assert_unreachable();
+      break;
+  }
+}
+
+template<eIMBInterpolationFilterMode Filter>
+static void transform_scanlines_filter(const TransformContext &ctx, IndexRange y_range)
+{
+  int channels = ctx.src->channels;
+  if (ctx.dst->float_buffer.data && ctx.src->float_buffer.data) {
+    /* Float images. */
+    if (channels == 4) {
+      transform_scanlines<Filter, float, 4>(ctx, y_range);
+    }
+    else if (channels == 3) {
+      transform_scanlines<Filter, float, 3>(ctx, y_range);
+    }
+    else if (channels == 2) {
+      transform_scanlines<Filter, float, 2>(ctx, y_range);
+    }
+    else if (channels == 1) {
+      transform_scanlines<Filter, float, 1>(ctx, y_range);
+    }
+  }
+  else if (ctx.dst->byte_buffer.data && ctx.src->byte_buffer.data) {
+    /* Byte images. */
+    if (channels == 4) {
+      transform_scanlines<Filter, uchar, 4>(ctx, y_range);
+    }
   }
 }
 
@@ -659,6 +382,7 @@ static void transform_threaded(TransformUserData *user_data, const eIMBTransform
 extern "C" {
 
 using namespace blender::imbuf::transform;
+using namespace blender;
 
 void IMB_transform(const ImBuf *src,
                    ImBuf *dst,
@@ -671,25 +395,28 @@ void IMB_transform(const ImBuf *src,
   BLI_assert_msg(mode != IMB_TRANSFORM_MODE_CROP_SRC || src_crop != nullptr,
                  "No source crop rect given, but crop source is requested. Or source crop rect "
                  "was given, but crop source was not requested.");
+  BLI_assert_msg(dst->channels == 4, "Destination image must have 4 channels.");
 
-  TransformUserData user_data;
-  user_data.src = src;
-  user_data.dst = dst;
-  if (mode == IMB_TRANSFORM_MODE_CROP_SRC) {
-    user_data.src_crop = *src_crop;
+  TransformContext ctx;
+  ctx.src = src;
+  ctx.dst = dst;
+  ctx.mode = mode;
+  bool crop = mode == IMB_TRANSFORM_MODE_CROP_SRC;
+  if (crop) {
+    ctx.src_crop = *src_crop;
   }
-  user_data.init(blender::float4x4(transform_matrix),
-                 num_subsamples,
-                 ELEM(mode, IMB_TRANSFORM_MODE_CROP_SRC));
+  ctx.init(blender::float4x4(transform_matrix), num_subsamples, crop);
 
-  if (filter == IMB_FILTER_NEAREST) {
-    transform_threaded<IMB_FILTER_NEAREST>(&user_data, mode);
-  }
-  else if (filter == IMB_FILTER_BILINEAR) {
-    transform_threaded<IMB_FILTER_BILINEAR>(&user_data, mode);
-  }
-  else if (filter == IMB_FILTER_BICUBIC) {
-    transform_threaded<IMB_FILTER_BICUBIC>(&user_data, mode);
-  }
+  threading::parallel_for(ctx.dst_region_y_range, 8, [&](IndexRange y_range) {
+    if (filter == IMB_FILTER_NEAREST) {
+      transform_scanlines_filter<IMB_FILTER_NEAREST>(ctx, y_range);
+    }
+    else if (filter == IMB_FILTER_BILINEAR) {
+      transform_scanlines_filter<IMB_FILTER_BILINEAR>(ctx, y_range);
+    }
+    else if (filter == IMB_FILTER_BICUBIC) {
+      transform_scanlines_filter<IMB_FILTER_BICUBIC>(ctx, y_range);
+    }
+  });
 }
 }
-- 
2.30.2