From 82182bce3dcb9bb4761a1ae06c710efa91219f78 Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Tue, 12 Dec 2023 10:41:02 +0200
Subject: [PATCH 1/2] VSE: reduce code duplication in some effects

Logic between byte vs float effects was the same in many places, so now
that the source is in C++, we can reduce that duplication. Effects:
alpha over, alpha under, gamma cross, wipe, apply blend function util.
---
 source/blender/sequencer/intern/effects.cc | 583 +++++++--------------
 1 file changed, 190 insertions(+), 393 deletions(-)
diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc
index 3a0d4e91ef8..99f6cd9c34a 100644
--- a/source/blender/sequencer/intern/effects.cc
+++ b/source/blender/sequencer/intern/effects.cc
@@ -64,6 +64,8 @@
 #include "strip_time.hh"
 #include "utils.hh"
 
+using blender::float4;
+
 static SeqEffectHandle get_sequence_effect_impl(int seq_type);
 
 /* -------------------------------------------------------------------- */
@@ -120,6 +122,44 @@ static void slice_get_float_buffers(const SeqRenderData *context,
   }
 }
 
+static float4 load_premul_pixel(const uchar *ptr)
+{
+  float4 res;
+  straight_uchar_to_premul_float(res, ptr);
+  return res;
+}
+
+static float4 load_premul_pixel(const float *ptr)
+{
+  return float4(ptr);
+}
+
+static void store_premul_pixel(const float4 &pix, uchar *dst)
+{
+  premul_float_to_straight_uchar(dst, pix);
+}
+
+static void store_premul_pixel(const float4 &pix, float *dst)
+{
+  *reinterpret_cast<float4 *>(dst) = pix;
+}
+
+static void store_opaque_black_pixel(uchar *dst)
+{
+  dst[0] = 0;
+  dst[1] = 0;
+  dst[2] = 0;
+  dst[3] = 255;
+}
+
+static void store_opaque_black_pixel(float *dst)
+{
+  dst[0] = 0.0f;
+  dst[1] = 0.0f;
+  dst[2] = 0.0f;
+  dst[3] = 1.0f;
+}
+
 /** \} */
 
 /* -------------------------------------------------------------------- */
@@ -203,72 +243,32 @@ static void init_alpha_over_or_under(Sequence *seq)
   seq->seq1 = seq2;
 }
 
-static void do_alphaover_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+/* dst = src1 over src2 (alpha from src1) */
+template<typename T>
+static void do_alphaover_effect(
+    float fac, int width, int height, const T *src1, const T *src2, T *dst)
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
-  uchar *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 over rt2  (alpha from rt1) */
-
-      float tempc[4], rt1[4], rt2[4];
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      float mfac = 1.0f - fac * rt1[3];
-
-      if (fac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp2);
-      }
-      else if (mfac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp1);
-      }
-      else {
-        tempc[0] = fac * rt1[0] + mfac * rt2[0];
-        tempc[1] = fac * rt1[1] + mfac * rt2[1];
-        tempc[2] = fac * rt1[2] + mfac * rt2[2];
-        tempc[3] = fac * rt1[3] + mfac * rt2[3];
-
-        premul_float_to_straight_uchar(rt, tempc);
-      }
-      cp1 += 4;
-      cp2 += 4;
-      rt += 4;
-    }
+  if (fac <= 0.0f) {
+    memcpy(dst, src2, sizeof(T) * 4 * width * height);
+    return;
   }
-}
 
-static void do_alphaover_effect_float(
-    float fac, int x, int y, float *rect1, float *rect2, float *out)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 col1 = load_premul_pixel(src1);
+      float mfac = 1.0f - fac * col1.w;
 
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 over rt2  (alpha from rt1) */
-
-      float mfac = 1.0f - (fac * rt1[3]);
-
-      if (fac <= 0.0f) {
-        memcpy(rt, rt2, sizeof(float[4]));
-      }
-      else if (mfac <= 0) {
-        memcpy(rt, rt1, sizeof(float[4]));
+      if (mfac <= 0.0f) {
+        memcpy(dst, src1, sizeof(T) * 4);
       }
       else {
-        rt[0] = fac * rt1[0] + mfac * rt2[0];
-        rt[1] = fac * rt1[1] + mfac * rt2[1];
-        rt[2] = fac * rt1[2] + mfac * rt2[2];
-        rt[3] = fac * rt1[3] + mfac * rt2[3];
+        float4 col2 = load_premul_pixel(src2);
+        float4 col = fac * col1 + mfac * col2;
+        store_premul_pixel(col, dst);
       }
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
     }
   }
 }
@@ -290,7 +290,7 @@ static void do_alphaover_effect(const SeqRenderData *context,
     slice_get_float_buffers(
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
-    do_alphaover_effect_float(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
   }
   else {
     uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -298,7 +298,7 @@ static void do_alphaover_effect(const SeqRenderData *context,
     slice_get_byte_buffers(
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
-    do_alphaover_effect_byte(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
   }
 }
 
@@ -308,88 +308,34 @@ static void do_alphaover_effect(const SeqRenderData *context,
 /** \name Alpha Under Effect
  * \{ */
 
-static void do_alphaunder_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+/* dst = src1 under src2 (alpha from src2) */
+template<typename T>
+static void do_alphaunder_effect(
+    float fac, int width, int height, const T *src1, const T *src2, T *dst)
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
-  uchar *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 under rt2  (alpha from rt2) */
-
-      float tempc[4], rt1[4], rt2[4];
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      /* this complex optimization is because the
-       * 'skybuf' can be crossed in
-       */
-      if (rt2[3] <= 0.0f && fac >= 1.0f) {
-        *((uint *)rt) = *((uint *)cp1);
-      }
-      else if (rt2[3] >= 1.0f) {
-        *((uint *)rt) = *((uint *)cp2);
-      }
-      else {
-        float temp_fac = (fac * (1.0f - rt2[3]));
-
-        if (fac <= 0) {
-          *((uint *)rt) = *((uint *)cp2);
-        }
-        else {
-          tempc[0] = (temp_fac * rt1[0] + rt2[0]);
-          tempc[1] = (temp_fac * rt1[1] + rt2[1]);
-          tempc[2] = (temp_fac * rt1[2] + rt2[2]);
-          tempc[3] = (temp_fac * rt1[3] + rt2[3]);
-
-          premul_float_to_straight_uchar(rt, tempc);
-        }
-      }
-      cp1 += 4;
-      cp2 += 4;
-      rt += 4;
-    }
+  if (fac >= 1.0f) {
+    memcpy(dst, src1, sizeof(T) * 4 * width * height);
+    return;
   }
-}
 
-static void do_alphaunder_effect_float(
-    float fac, int x, int y, float *rect1, float *rect2, float *out)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 under rt2  (alpha from rt2) */
-
-      /* this complex optimization is because the
-       * 'skybuf' can be crossed in
-       */
-      if (rt2[3] <= 0 && fac >= 1.0f) {
-        memcpy(rt, rt1, sizeof(float[4]));
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 col2 = load_premul_pixel(src2);
+      if (col2.w <= 0.0f) {
+        memcpy(dst, src1, sizeof(T) * 4);
       }
-      else if (rt2[3] >= 1.0f) {
-        memcpy(rt, rt2, sizeof(float[4]));
+      else if (col2.w >= 1.0f || fac <= 0.0f) {
+        memcpy(dst, src2, sizeof(T) * 4);
       }
       else {
-        float temp_fac = fac * (1.0f - rt2[3]);
-
-        if (fac == 0) {
-          memcpy(rt, rt2, sizeof(float[4]));
-        }
-        else {
-          rt[0] = temp_fac * rt1[0] + rt2[0];
-          rt[1] = temp_fac * rt1[1] + rt2[1];
-          rt[2] = temp_fac * rt1[2] + rt2[2];
-          rt[3] = temp_fac * rt1[3] + rt2[3];
-        }
+        float mfac = fac * (1.0f - col2.w);
+        float4 col1 = load_premul_pixel(src1);
+        float4 col = mfac * col1 + col2;
+        store_premul_pixel(col, dst);
       }
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
     }
   }
 }
@@ -411,7 +357,7 @@ static void do_alphaunder_effect(const SeqRenderData *context,
     slice_get_float_buffers(
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
-    do_alphaunder_effect_float(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaunder_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
   }
   else {
     uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -419,7 +365,7 @@ static void do_alphaunder_effect(const SeqRenderData *context,
     slice_get_byte_buffers(
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
-    do_alphaunder_effect_byte(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_alphaunder_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
   }
 }
 
@@ -527,53 +473,24 @@ static float invGammaCorrect(float c)
   return sqrtf_signed(c);
 }
 
-static void do_gammacross_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+template<typename T>
+static void do_gammacross_effect(
+    float fac, int width, int height, const T *src1, const T *src2, T *dst)
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
-  uchar *rt = out;
-
   float mfac = 1.0f - fac;
 
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      float rt1[4], rt2[4], tempc[4];
-
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      tempc[0] = gammaCorrect(mfac * invGammaCorrect(rt1[0]) + fac * invGammaCorrect(rt2[0]));
-      tempc[1] = gammaCorrect(mfac * invGammaCorrect(rt1[1]) + fac * invGammaCorrect(rt2[1]));
-      tempc[2] = gammaCorrect(mfac * invGammaCorrect(rt1[2]) + fac * invGammaCorrect(rt2[2]));
-      tempc[3] = gammaCorrect(mfac * invGammaCorrect(rt1[3]) + fac * invGammaCorrect(rt2[3]));
-
-      premul_float_to_straight_uchar(rt, tempc);
-      cp1 += 4;
-      cp2 += 4;
-      rt += 4;
-    }
-  }
-}
-
-static void do_gammacross_effect_float(
-    float fac, int x, int y, float *rect1, float *rect2, float *out)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
-
-  float mfac = 1.0f - fac;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      rt[0] = gammaCorrect(mfac * invGammaCorrect(rt1[0]) + fac * invGammaCorrect(rt2[0]));
-      rt[1] = gammaCorrect(mfac * invGammaCorrect(rt1[1]) + fac * invGammaCorrect(rt2[1]));
-      rt[2] = gammaCorrect(mfac * invGammaCorrect(rt1[2]) + fac * invGammaCorrect(rt2[2]));
-      rt[3] = gammaCorrect(mfac * invGammaCorrect(rt1[3]) + fac * invGammaCorrect(rt2[3]));
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 col1 = load_premul_pixel(src1);
+      float4 col2 = load_premul_pixel(src2);
+      float4 col;
+      for (int c = 0; c < 4; ++c) {
+        col[c] = gammaCorrect(mfac * invGammaCorrect(col1[c]) + fac * invGammaCorrect(col2[c]));
+      }
+      store_premul_pixel(col, dst);
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
     }
   }
 }
@@ -604,7 +521,7 @@ static void do_gammacross_effect(const SeqRenderData *context,
     slice_get_float_buffers(
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
-    do_gammacross_effect_float(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_gammacross_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
   }
   else {
     uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -612,7 +529,7 @@ static void do_gammacross_effect(const SeqRenderData *context,
     slice_get_byte_buffers(
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
-    do_gammacross_effect_byte(fac, context->rectx, total_lines, rect1, rect2, rect_out);
+    do_gammacross_effect(fac, context->rectx, total_lines, rect1, rect2, rect_out);
   }
 }
 
@@ -943,57 +860,21 @@ static void do_mul_effect(const SeqRenderData *context,
 /** \name Blend Mode Effect
  * \{ */
 
-using IMB_blend_func_byte = void (*)(uchar *dst, const uchar *src1, const uchar *src2);
-using IMB_blend_func_float = void (*)(float *dst, const float *src1, const float *src2);
-
-BLI_INLINE void apply_blend_function_byte(float fac,
-                                          int x,
-                                          int y,
-                                          uchar *rect1,
-                                          uchar *rect2,
-                                          uchar *out,
-                                          IMB_blend_func_byte blend_function)
+/* blend_function has to be: void (T* dst, const T *src1, const T *src2) */
+template<typename T, typename Func>
+static void apply_blend_function(
+    float fac, int width, int height, const T *src1, T *src2, T *dst, Func blend_function)
 {
-  uchar *rt1 = rect1;
-  uchar *rt2 = rect2;
-  uchar *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      uint achannel = rt2[3];
-      rt2[3] = uint(achannel) * fac;
-      blend_function(rt, rt1, rt2);
-      rt2[3] = achannel;
-      rt[3] = rt1[3];
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
-    }
-  }
-}
-
-BLI_INLINE void apply_blend_function_float(float fac,
-                                           int x,
-                                           int y,
-                                           float *rect1,
-                                           float *rect2,
-                                           float *out,
-                                           IMB_blend_func_float blend_function)
-{
-  float *rt1 = rect1;
-  float *rt2 = rect2;
-  float *rt = out;
-
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      float achannel = rt2[3];
-      rt2[3] = achannel * fac;
-      blend_function(rt, rt1, rt2);
-      rt2[3] = achannel;
-      rt[3] = rt1[3];
-      rt1 += 4;
-      rt2 += 4;
-      rt += 4;
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      T achannel = src2[3];
+      src2[3] = T(achannel * fac);
+      blend_function(dst, src1, src2);
+      src2[3] = achannel;
+      dst[3] = src1[3];
+      src1 += 4;
+      src2 += 4;
+      dst += 4;
     }
   }
 }
@@ -1003,67 +884,67 @@ static void do_blend_effect_float(
 {
   switch (btype) {
     case SEQ_TYPE_ADD:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_add_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_add_float);
       break;
     case SEQ_TYPE_SUB:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_sub_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_sub_float);
       break;
     case SEQ_TYPE_MUL:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_mul_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_mul_float);
       break;
     case SEQ_TYPE_DARKEN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_darken_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_darken_float);
       break;
     case SEQ_TYPE_COLOR_BURN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_burn_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_burn_float);
       break;
     case SEQ_TYPE_LINEAR_BURN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_linearburn_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearburn_float);
       break;
     case SEQ_TYPE_SCREEN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_screen_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_screen_float);
       break;
     case SEQ_TYPE_LIGHTEN:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_lighten_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_lighten_float);
       break;
     case SEQ_TYPE_DODGE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_dodge_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_dodge_float);
       break;
     case SEQ_TYPE_OVERLAY:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_overlay_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_overlay_float);
       break;
     case SEQ_TYPE_SOFT_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_softlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_softlight_float);
       break;
     case SEQ_TYPE_HARD_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_hardlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hardlight_float);
       break;
     case SEQ_TYPE_PIN_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_pinlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_pinlight_float);
       break;
     case SEQ_TYPE_LIN_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_linearlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearlight_float);
       break;
     case SEQ_TYPE_VIVID_LIGHT:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_vividlight_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_vividlight_float);
       break;
     case SEQ_TYPE_BLEND_COLOR:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_color_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_color_float);
       break;
     case SEQ_TYPE_HUE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_hue_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hue_float);
       break;
     case SEQ_TYPE_SATURATION:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_saturation_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_saturation_float);
       break;
     case SEQ_TYPE_VALUE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_luminosity_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_luminosity_float);
       break;
     case SEQ_TYPE_DIFFERENCE:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_difference_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_difference_float);
       break;
     case SEQ_TYPE_EXCLUSION:
-      apply_blend_function_float(fac, x, y, rect1, rect2, out, blend_color_exclusion_float);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_exclusion_float);
       break;
     default:
       break;
@@ -1075,67 +956,67 @@ static void do_blend_effect_byte(
 {
   switch (btype) {
     case SEQ_TYPE_ADD:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_add_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_add_byte);
       break;
     case SEQ_TYPE_SUB:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_sub_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_sub_byte);
       break;
     case SEQ_TYPE_MUL:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_mul_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_mul_byte);
       break;
     case SEQ_TYPE_DARKEN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_darken_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_darken_byte);
       break;
     case SEQ_TYPE_COLOR_BURN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_burn_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_burn_byte);
       break;
     case SEQ_TYPE_LINEAR_BURN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_linearburn_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearburn_byte);
       break;
     case SEQ_TYPE_SCREEN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_screen_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_screen_byte);
       break;
     case SEQ_TYPE_LIGHTEN:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_lighten_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_lighten_byte);
       break;
     case SEQ_TYPE_DODGE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_dodge_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_dodge_byte);
       break;
     case SEQ_TYPE_OVERLAY:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_overlay_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_overlay_byte);
       break;
     case SEQ_TYPE_SOFT_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_softlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_softlight_byte);
       break;
     case SEQ_TYPE_HARD_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_hardlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hardlight_byte);
       break;
     case SEQ_TYPE_PIN_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_pinlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_pinlight_byte);
       break;
     case SEQ_TYPE_LIN_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_linearlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_linearlight_byte);
       break;
     case SEQ_TYPE_VIVID_LIGHT:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_vividlight_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_vividlight_byte);
       break;
     case SEQ_TYPE_BLEND_COLOR:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_color_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_color_byte);
       break;
     case SEQ_TYPE_HUE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_hue_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_hue_byte);
       break;
     case SEQ_TYPE_SATURATION:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_saturation_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_saturation_byte);
       break;
     case SEQ_TYPE_VALUE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_luminosity_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_luminosity_byte);
       break;
     case SEQ_TYPE_DIFFERENCE:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_difference_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_difference_byte);
       break;
     case SEQ_TYPE_EXCLUSION:
-      apply_blend_function_byte(fac, x, y, rect1, rect2, out, blend_color_exclusion_byte);
+      apply_blend_function(fac, x, y, rect1, rect2, out, blend_color_exclusion_byte);
       break;
     default:
       break;
@@ -1516,58 +1397,38 @@ static void copy_wipe_effect(Sequence *dst, Sequence *src, const int /*flag*/)
   dst->effectdata = MEM_dupallocN(src->effectdata);
 }
 
-static void do_wipe_effect_byte(const Sequence *seq,
-                                float fac,
-                                int width,
-                                int height,
-                                const uchar *rect1,
-                                const uchar *rect2,
-                                uchar *out)
+template<typename T>
+static void do_wipe_effect(
+    const Sequence *seq, float fac, int width, int height, const T *rect1, const T *rect2, T *out)
 {
   using namespace blender;
   const WipeVars *wipe = (const WipeVars *)seq->effectdata;
   const WipeZone wipezone = precalc_wipe_zone(wipe, width, height);
 
   threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) {
-    const uchar *cp1 = rect1 + y_range.first() * width * 4;
-    const uchar *cp2 = rect2 + y_range.first() * width * 4;
-    uchar *rt = out + y_range.first() * width * 4;
+    const T *cp1 = rect1 ? rect1 + y_range.first() * width * 4 : nullptr;
+    const T *cp2 = rect2 ? rect2 + y_range.first() * width * 4 : nullptr;
+    T *rt = out + y_range.first() * width * 4;
     for (const int y : y_range) {
       for (int x = 0; x < width; x++) {
         float check = check_zone(&wipezone, x, y, fac);
         if (check) {
           if (cp1) {
-            float rt1[4], rt2[4], tempc[4];
-
-            straight_uchar_to_premul_float(rt1, cp1);
-            straight_uchar_to_premul_float(rt2, cp2);
-
-            tempc[0] = rt1[0] * check + rt2[0] * (1 - check);
-            tempc[1] = rt1[1] * check + rt2[1] * (1 - check);
-            tempc[2] = rt1[2] * check + rt2[2] * (1 - check);
-            tempc[3] = rt1[3] * check + rt2[3] * (1 - check);
-
-            premul_float_to_straight_uchar(rt, tempc);
+            float4 col1 = load_premul_pixel(cp1);
+            float4 col2 = load_premul_pixel(cp2);
+            float4 col = col1 * check + col2 * (1.0f - check);
+            store_premul_pixel(col, rt);
           }
           else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 255;
+            store_opaque_black_pixel(rt);
           }
         }
         else {
           if (cp2) {
-            rt[0] = cp2[0];
-            rt[1] = cp2[1];
-            rt[2] = cp2[2];
-            rt[3] = cp2[3];
+            memcpy(rt, cp2, sizeof(T) * 4);
           }
           else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 255;
+            store_opaque_black_pixel(rt);
           }
         }
 
@@ -1583,66 +1444,6 @@ static void do_wipe_effect_byte(const Sequence *seq,
   });
 }
 
-static void do_wipe_effect_float(Sequence *seq,
-                                 float fac,
-                                 int width,
-                                 int height,
-                                 const float *rect1,
-                                 const float *rect2,
-                                 float *out)
-{
-  using namespace blender;
-  const WipeVars *wipe = (const WipeVars *)seq->effectdata;
-  const WipeZone wipezone = precalc_wipe_zone(wipe, width, height);
-
-  threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) {
-    const float *rt1 = rect1 + y_range.first() * width * 4;
-    const float *rt2 = rect2 + y_range.first() * width * 4;
-    float *rt = out + y_range.first() * width * 4;
-    for (const int y : y_range) {
-      for (int x = 0; x < width; x++) {
-        float check = check_zone(&wipezone, x, y, fac);
-        if (check) {
-          if (rt1) {
-            rt[0] = rt1[0] * check + rt2[0] * (1 - check);
-            rt[1] = rt1[1] * check + rt2[1] * (1 - check);
-            rt[2] = rt1[2] * check + rt2[2] * (1 - check);
-            rt[3] = rt1[3] * check + rt2[3] * (1 - check);
-          }
-          else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 1.0;
-          }
-        }
-        else {
-          if (rt2) {
-            rt[0] = rt2[0];
-            rt[1] = rt2[1];
-            rt[2] = rt2[2];
-            rt[3] = rt2[3];
-          }
-          else {
-            rt[0] = 0;
-            rt[1] = 0;
-            rt[2] = 0;
-            rt[3] = 1.0;
-          }
-        }
-
-        rt += 4;
-        if (rt1 != nullptr) {
-          rt1 += 4;
-        }
-        if (rt2 != nullptr) {
-          rt2 += 4;
-        }
-      }
-    }
-  });
-}
-
 static ImBuf *do_wipe_effect(const SeqRenderData *context,
                              Sequence *seq,
                              float /*timeline_frame*/,
@@ -1654,22 +1455,22 @@ static ImBuf *do_wipe_effect(const SeqRenderData *context,
   ImBuf *out = prepare_effect_imbufs(context, ibuf1, ibuf2, ibuf3);
 
   if (out->float_buffer.data) {
-    do_wipe_effect_float(seq,
-                         fac,
-                         context->rectx,
-                         context->recty,
-                         ibuf1->float_buffer.data,
-                         ibuf2->float_buffer.data,
-                         out->float_buffer.data);
+    do_wipe_effect(seq,
+                   fac,
+                   context->rectx,
+                   context->recty,
+                   ibuf1->float_buffer.data,
+                   ibuf2->float_buffer.data,
+                   out->float_buffer.data);
   }
   else {
-    do_wipe_effect_byte(seq,
-                        fac,
-                        context->rectx,
-                        context->recty,
-                        ibuf1->byte_buffer.data,
-                        ibuf2->byte_buffer.data,
-                        out->byte_buffer.data);
+    do_wipe_effect(seq,
+                   fac,
+                   context->rectx,
+                   context->recty,
+                   ibuf1->byte_buffer.data,
+                   ibuf2->byte_buffer.data,
+                   out->byte_buffer.data);
   }
 
   return out;
@@ -1837,12 +1638,8 @@ static void do_transform_effect(const SeqRenderData *context,
 /** \name Glow Effect
  * \{ */
 
-static void glow_blur_bitmap(const blender::float4 *src,
-                             blender::float4 *map,
-                             int width,
-                             int height,
-                             float blur,
-                             int quality)
+static void glow_blur_bitmap(
+    const float4 *src, float4 *map, int width, int height, float blur, int quality)
 {
   using namespace blender;
 
@@ -1913,8 +1710,8 @@ static void glow_blur_bitmap(const blender::float4 *src,
   });
 }
 
-static void blur_isolate_highlights(const blender::float4 *in,
-                                    blender::float4 *out,
+static void blur_isolate_highlights(const float4 *in,
+                                    float4 *out,
                                     int width,
                                     int height,
                                     float threshold,
@@ -2525,7 +2322,7 @@ static void do_overdrop_effect(const SeqRenderData *context,
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
     do_drop_effect_float(fac, x, y, rect1, rect2, rect_out);
-    do_alphaover_effect_float(fac, x, y, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, x, y, rect1, rect2, rect_out);
   }
   else {
     uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
@@ -2534,7 +2331,7 @@ static void do_overdrop_effect(const SeqRenderData *context,
         context, ibuf1, ibuf2, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
 
     do_drop_effect_byte(fac, x, y, rect1, rect2, rect_out);
-    do_alphaover_effect_byte(fac, x, y, rect1, rect2, rect_out);
+    do_alphaover_effect(fac, x, y, rect1, rect2, rect_out);
   }
 }
 
-- 
2.30.2


From bac048169736bbde8847217fbb33469cc4debbf5 Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Tue, 12 Dec 2023 11:22:27 +0200
Subject: [PATCH 2/2] VSE: make Gaussian Blur effect both faster and with less
 code

Similar to previous commit, there was a lot of code duplication
between "byte" and "float" gaussian blur code variants. Also:

- Use direct parallel_for threading instead of very roundabout
  way that was coming from C times. Way less code, and allows loading
  CPU cores better (parallel_for seems better than task pool, also
  gaussian blur is relatively expensive so use smaller grain sizes).
- Calculate gaussian kernel tables once per effect, instead of once
  per CPU job.
- Do "sample from this to that pixel" including boundary conditions
  calculation explicitly, instead of checking boundary condition
  at each iteration inside the inner loop.

Applying gaussian blur effect of 100x100 size on 4K UHD sequencer
strip input, on Windows / Ryzen 5950X: 630ms -> 450ms

And with 220 fewer lines of code :)
---
 source/blender/sequencer/intern/effects.cc | 467 ++++++---------------
 1 file changed, 124 insertions(+), 343 deletions(-)

diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc
index 99f6cd9c34a..f8ef6a668fc 100644
--- a/source/blender/sequencer/intern/effects.cc
+++ b/source/blender/sequencer/intern/effects.cc
@@ -2341,14 +2341,6 @@ static void do_overdrop_effect(const SeqRenderData *context,
 /** \name Gaussian Blur
  * \{ */
 
-/* NOTE: This gaussian blur implementation accumulates values in the square
- * kernel rather that doing X direction and then Y direction because of the
- * lack of using multiple-staged filters.
- *
- * Once we can we'll implement a way to apply filter as multiple stages we
- * can optimize hell of a lot in here.
- */
-
 static void init_gaussian_blur_effect(Sequence *seq)
 {
   if (seq->effectdata) {
@@ -2382,346 +2374,92 @@ static int early_out_gaussian_blur(Sequence *seq, float /*fac*/)
   return EARLY_DO_EFFECT;
 }
 
-/* TODO(sergey): De-duplicate with compositor. */
-static float *make_gaussian_blur_kernel(float rad, int size)
+static blender::Array<float> make_gaussian_blur_kernel(float rad, int size)
 {
-  float *gausstab, sum, val;
-  float fac;
-  int i, n;
+  int n = 2 * size + 1;
+  blender::Array<float> gausstab(n);
 
-  n = 2 * size + 1;
-
-  gausstab = (float *)MEM_mallocN(sizeof(float) * n, __func__);
-
-  sum = 0.0f;
-  fac = (rad > 0.0f ? 1.0f / rad : 0.0f);
-  for (i = -size; i <= size; i++) {
-    val = RE_filter_value(R_FILTER_GAUSS, float(i) * fac);
+  float sum = 0.0f;
+  float fac = (rad > 0.0f ? 1.0f / rad : 0.0f);
+  for (int i = -size; i <= size; i++) {
+    float val = RE_filter_value(R_FILTER_GAUSS, float(i) * fac);
     sum += val;
     gausstab[i + size] = val;
   }
 
-  sum = 1.0f / sum;
-  for (i = 0; i < n; i++) {
-    gausstab[i] *= sum;
+  float inv_sum = 1.0f / sum;
+  for (int i = 0; i < n; i++) {
+    gausstab[i] *= inv_sum;
   }
 
   return gausstab;
 }
 
-static void do_gaussian_blur_effect_byte_x(Sequence *seq,
-                                           int start_line,
-                                           int x,
-                                           int y,
-                                           int frame_width,
-                                           int /*frame_height*/,
-                                           const uchar *rect,
-                                           uchar *out)
+template<typename T>
+static void gaussian_blur_x(const blender::Array<float> &gausstab,
+                            int half_size,
+                            int start_line,
+                            int width,
+                            int height,
+                            int /*frame_height*/,
+                            const T *rect,
+                            T *dst)
 {
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_x = int(data->size_x + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_x;
-  gausstab_x = make_gaussian_blur_kernel(data->size_x, size_x);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  dst += start_line * width * 4;
+  for (int y = start_line; y < start_line + height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 accum(0.0f);
       float accum_weight = 0.0f;
 
-      for (int current_x = j - size_x; current_x <= j + size_x; current_x++) {
-        if (current_x < 0 || current_x >= frame_width) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(current_x, i + start_line);
-        float weight = gausstab_x[current_x - j + size_x];
-        accum[0] += rect[index] * weight;
-        accum[1] += rect[index + 1] * weight;
-        accum[2] += rect[index + 2] * weight;
-        accum[3] += rect[index + 3] * weight;
+      int xmin = blender::math::max(x - half_size, 0);
+      int xmax = blender::math::min(x + half_size, width - 1);
+      for (int nx = xmin, index = (xmin - x) + half_size; nx <= xmax; nx++, index++) {
+        float weight = gausstab[index];
+        int offset = (y * width + nx) * 4;
+        accum += float4(rect + offset) * weight;
         accum_weight += weight;
       }
-
-      float inv_accum_weight = 1.0f / accum_weight;
-      out[out_index + 0] = accum[0] * inv_accum_weight;
-      out[out_index + 1] = accum[1] * inv_accum_weight;
-      out[out_index + 2] = accum[2] * inv_accum_weight;
-      out[out_index + 3] = accum[3] * inv_accum_weight;
+      accum *= (1.0f / accum_weight);
+      dst[0] = accum[0];
+      dst[1] = accum[1];
+      dst[2] = accum[2];
+      dst[3] = accum[3];
+      dst += 4;
     }
   }
-
-  MEM_freeN(gausstab_x);
-#undef INDEX
 }
 
-static void do_gaussian_blur_effect_byte_y(Sequence *seq,
-                                           int start_line,
-                                           int x,
-                                           int y,
-                                           int /*frame_width*/,
-                                           int frame_height,
-                                           const uchar *rect,
-                                           uchar *out)
+template<typename T>
+static void gaussian_blur_y(const blender::Array<float> &gausstab,
+                            int half_size,
+                            int start_line,
+                            int width,
+                            int height,
+                            int frame_height,
+                            const T *rect,
+                            T *dst)
 {
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_y = int(data->size_y + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_y;
-  gausstab_y = make_gaussian_blur_kernel(data->size_y, size_y);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+  dst += start_line * width * 4;
+  for (int y = start_line; y < start_line + height; y++) {
+    for (int x = 0; x < width; x++) {
+      float4 accum(0.0f);
       float accum_weight = 0.0f;
-      for (int current_y = i - size_y; current_y <= i + size_y; current_y++) {
-        if (current_y < -start_line || current_y + start_line >= frame_height) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(j, current_y + start_line);
-        float weight = gausstab_y[current_y - i + size_y];
-        accum[0] += rect[index] * weight;
-        accum[1] += rect[index + 1] * weight;
-        accum[2] += rect[index + 2] * weight;
-        accum[3] += rect[index + 3] * weight;
+      int ymin = blender::math::max(y - half_size, 0);
+      int ymax = blender::math::min(y + half_size, frame_height - 1);
+      for (int ny = ymin, index = (ymin - y) + half_size; ny <= ymax; ny++, index++) {
+        float weight = gausstab[index];
+        int offset = (ny * width + x) * 4;
+        accum += float4(rect + offset) * weight;
         accum_weight += weight;
       }
-      float inv_accum_weight = 1.0f / accum_weight;
-      out[out_index + 0] = accum[0] * inv_accum_weight;
-      out[out_index + 1] = accum[1] * inv_accum_weight;
-      out[out_index + 2] = accum[2] * inv_accum_weight;
-      out[out_index + 3] = accum[3] * inv_accum_weight;
+      accum *= (1.0f / accum_weight);
+      dst[0] = accum[0];
+      dst[1] = accum[1];
+      dst[2] = accum[2];
+      dst[3] = accum[3];
+      dst += 4;
     }
   }
-
-  MEM_freeN(gausstab_y);
-#undef INDEX
-}
-
-static void do_gaussian_blur_effect_float_x(Sequence *seq,
-                                            int start_line,
-                                            int x,
-                                            int y,
-                                            int frame_width,
-                                            int /*frame_height*/,
-                                            float *rect,
-                                            float *out)
-{
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_x = int(data->size_x + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_x;
-  gausstab_x = make_gaussian_blur_kernel(data->size_x, size_x);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-      float accum_weight = 0.0f;
-      for (int current_x = j - size_x; current_x <= j + size_x; current_x++) {
-        if (current_x < 0 || current_x >= frame_width) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(current_x, i + start_line);
-        float weight = gausstab_x[current_x - j + size_x];
-        madd_v4_v4fl(accum, &rect[index], weight);
-        accum_weight += weight;
-      }
-      mul_v4_v4fl(&out[out_index], accum, 1.0f / accum_weight);
-    }
-  }
-
-  MEM_freeN(gausstab_x);
-#undef INDEX
-}
-
-static void do_gaussian_blur_effect_float_y(Sequence *seq,
-                                            int start_line,
-                                            int x,
-                                            int y,
-                                            int /*frame_width*/,
-                                            int frame_height,
-                                            float *rect,
-                                            float *out)
-{
-#define INDEX(_x, _y) (((_y) * (x) + (_x)) * 4)
-  GaussianBlurVars *data = static_cast<GaussianBlurVars *>(seq->effectdata);
-  const int size_y = int(data->size_y + 0.5f);
-  int i, j;
-
-  /* Make gaussian weight table. */
-  float *gausstab_y;
-  gausstab_y = make_gaussian_blur_kernel(data->size_y, size_y);
-
-  for (i = 0; i < y; i++) {
-    for (j = 0; j < x; j++) {
-      int out_index = INDEX(j, i);
-      float accum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-      float accum_weight = 0.0f;
-      for (int current_y = i - size_y; current_y <= i + size_y; current_y++) {
-        if (current_y < -start_line || current_y + start_line >= frame_height) {
-          /* Out of bounds. */
-          continue;
-        }
-        int index = INDEX(j, current_y + start_line);
-        float weight = gausstab_y[current_y - i + size_y];
-        madd_v4_v4fl(accum, &rect[index], weight);
-        accum_weight += weight;
-      }
-      mul_v4_v4fl(&out[out_index], accum, 1.0f / accum_weight);
-    }
-  }
-
-  MEM_freeN(gausstab_y);
-#undef INDEX
-}
-
-static void do_gaussian_blur_effect_x_cb(const SeqRenderData *context,
-                                         Sequence *seq,
-                                         ImBuf *ibuf,
-                                         int start_line,
-                                         int total_lines,
-                                         ImBuf *out)
-{
-  if (out->float_buffer.data) {
-    float *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_float_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_float_x(seq,
-                                    start_line,
-                                    context->rectx,
-                                    total_lines,
-                                    context->rectx,
-                                    context->recty,
-                                    ibuf->float_buffer.data,
-                                    rect_out);
-  }
-  else {
-    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_byte_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_byte_x(seq,
-                                   start_line,
-                                   context->rectx,
-                                   total_lines,
-                                   context->rectx,
-                                   context->recty,
-                                   ibuf->byte_buffer.data,
-                                   rect_out);
-  }
-}
-
-static void do_gaussian_blur_effect_y_cb(const SeqRenderData *context,
-                                         Sequence *seq,
-                                         ImBuf *ibuf,
-                                         int start_line,
-                                         int total_lines,
-                                         ImBuf *out)
-{
-  if (out->float_buffer.data) {
-    float *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_float_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_float_y(seq,
-                                    start_line,
-                                    context->rectx,
-                                    total_lines,
-                                    context->rectx,
-                                    context->recty,
-                                    ibuf->float_buffer.data,
-                                    rect_out);
-  }
-  else {
-    uchar *rect1 = nullptr, *rect2 = nullptr, *rect_out = nullptr;
-
-    slice_get_byte_buffers(
-        context, ibuf, nullptr, nullptr, out, start_line, &rect1, &rect2, nullptr, &rect_out);
-
-    do_gaussian_blur_effect_byte_y(seq,
-                                   start_line,
-                                   context->rectx,
-                                   total_lines,
-                                   context->rectx,
-                                   context->recty,
-                                   ibuf->byte_buffer.data,
-                                   rect_out);
-  }
-}
-
-struct RenderGaussianBlurEffectInitData {
-  const SeqRenderData *context;
-  Sequence *seq;
-  ImBuf *ibuf;
-  ImBuf *out;
-};
-
-struct RenderGaussianBlurEffectThread {
-  const SeqRenderData *context;
-  Sequence *seq;
-  ImBuf *ibuf;
-  ImBuf *out;
-  int start_line, tot_line;
-};
-
-static void render_effect_execute_init_handle(void *handle_v,
-                                              int start_line,
-                                              int tot_line,
-                                              void *init_data_v)
-{
-  RenderGaussianBlurEffectThread *handle = (RenderGaussianBlurEffectThread *)handle_v;
-  RenderGaussianBlurEffectInitData *init_data = (RenderGaussianBlurEffectInitData *)init_data_v;
-
-  handle->context = init_data->context;
-  handle->seq = init_data->seq;
-  handle->ibuf = init_data->ibuf;
-  handle->out = init_data->out;
-
-  handle->start_line = start_line;
-  handle->tot_line = tot_line;
-}
-
-static void *render_effect_execute_do_x_thread(void *thread_data_v)
-{
-  RenderGaussianBlurEffectThread *thread_data = (RenderGaussianBlurEffectThread *)thread_data_v;
-  do_gaussian_blur_effect_x_cb(thread_data->context,
-                               thread_data->seq,
-                               thread_data->ibuf,
-                               thread_data->start_line,
-                               thread_data->tot_line,
-                               thread_data->out);
-  return nullptr;
-}
-
-static void *render_effect_execute_do_y_thread(void *thread_data_v)
-{
-  RenderGaussianBlurEffectThread *thread_data = (RenderGaussianBlurEffectThread *)thread_data_v;
-  do_gaussian_blur_effect_y_cb(thread_data->context,
-                               thread_data->seq,
-                               thread_data->ibuf,
-                               thread_data->start_line,
-                               thread_data->tot_line,
-                               thread_data->out);
-
-  return nullptr;
 }
 
 static ImBuf *do_gaussian_blur_effect(const SeqRenderData *context,
@@ -2732,32 +2470,75 @@ static ImBuf *do_gaussian_blur_effect(const SeqRenderData *context,
                                       ImBuf * /*ibuf2*/,
                                       ImBuf * /*ibuf3*/)
 {
+  using namespace blender;
+
+  /* Create blur kernel weights. */
+  const GaussianBlurVars *data = static_cast<const GaussianBlurVars *>(seq->effectdata);
+  const int half_size_x = int(data->size_x + 0.5f);
+  const int half_size_y = int(data->size_y + 0.5f);
+  Array<float> gausstab_x = make_gaussian_blur_kernel(data->size_x, half_size_x);
+  Array<float> gausstab_y = make_gaussian_blur_kernel(data->size_y, half_size_y);
+
+  const int width = context->rectx;
+  const int height = context->recty;
+  const bool is_float = ibuf1->float_buffer.data;
+
+  /* Horizontal blur: create output, blur ibuf1 into it. */
   ImBuf *out = prepare_effect_imbufs(context, ibuf1, nullptr, nullptr);
+  threading::parallel_for(IndexRange(context->recty), 32, [&](const IndexRange y_range) {
+    const int y_first = y_range.first();
+    const int y_size = y_range.size();
+    if (is_float) {
+      gaussian_blur_x(gausstab_x,
+                      half_size_x,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->float_buffer.data,
+                      out->float_buffer.data);
+    }
+    else {
+      gaussian_blur_x(gausstab_x,
+                      half_size_x,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->byte_buffer.data,
+                      out->byte_buffer.data);
+    }
+  });
 
-  RenderGaussianBlurEffectInitData init_data;
-
-  init_data.context = context;
-  init_data.seq = seq;
-  init_data.ibuf = ibuf1;
-  init_data.out = out;
-
-  IMB_processor_apply_threaded(out->y,
-                               sizeof(RenderGaussianBlurEffectThread),
-                               &init_data,
-                               render_effect_execute_init_handle,
-                               render_effect_execute_do_x_thread);
-
+  /* Vertical blur: create output, blur previous output into it. */
   ibuf1 = out;
-  init_data.ibuf = ibuf1;
   out = prepare_effect_imbufs(context, ibuf1, nullptr, nullptr);
-  init_data.out = out;
-
-  IMB_processor_apply_threaded(out->y,
-                               sizeof(RenderGaussianBlurEffectThread),
-                               &init_data,
-                               render_effect_execute_init_handle,
-                               render_effect_execute_do_y_thread);
+  threading::parallel_for(IndexRange(context->recty), 32, [&](const IndexRange y_range) {
+    const int y_first = y_range.first();
+    const int y_size = y_range.size();
+    if (is_float) {
+      gaussian_blur_y(gausstab_y,
+                      half_size_y,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->float_buffer.data,
+                      out->float_buffer.data);
+    }
+    else {
+      gaussian_blur_y(gausstab_y,
+                      half_size_y,
+                      y_first,
+                      width,
+                      y_size,
+                      height,
+                      ibuf1->byte_buffer.data,
+                      out->byte_buffer.data);
+    }
+  });
 
+  /* Free the first output. */
   IMB_freeImBuf(ibuf1);
 
   return out;
-- 
2.30.2