From 20b2d709ab4f2010d99d5a4b0aaa6406af392de4 Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius
Date: Thu, 7 Dec 2023 14:31:15 +0200
Subject: [PATCH] VSE: speed up Alpha Over blend with SIMD

Alpha Over is the default blend mode used by all strips (images, text,
etc.), so while SIMD could arguably be used in all of the blend modes,
this one is probably the most important.

Makes straight_uchar_to_premul_float, the actual blending math, and
premul_float_to_straight_uchar use SIMD in a 4-wide fashion, i.e. it
still works on one pixel at a time, processing the four color channels
in parallel.

At 4K UHD resolution, on Windows with a Ryzen 5950X:
- do_alphaover_effect_byte time: 6.82ms -> 4.93ms
- with four image strips, sequencer playback: 15.9 FPS -> 17.9 FPS
  (there are other time costs, not just the alpha-over blending)
---
 source/blender/sequencer/intern/effects.cc | 142 +++++++++++++++++----
 1 file changed, 114 insertions(+), 28 deletions(-)

diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc
index 3ab8bdcaa10..fefcc0f9e48 100644
--- a/source/blender/sequencer/intern/effects.cc
+++ b/source/blender/sequencer/intern/effects.cc
@@ -21,6 +21,7 @@
 #include "BLI_math_vector_types.hh"
 #include "BLI_path_util.h"
 #include "BLI_rect.h"
+#include "BLI_simd.h"
 #include "BLI_string.h"
 #include "BLI_task.hh"
 #include "BLI_threads.h"
@@ -203,37 +204,122 @@ static void init_alpha_over_or_under(Sequence *seq)
   seq->seq1 = seq2;
 }
 
-static void do_alphaover_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+#if BLI_HAVE_SSE2
+
+static __m128 straight_uchar_to_premul_float_simd(const unsigned char color[4])
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
+  int packed;
+  memcpy(&packed, color, 4);
+  /* Packed 8 bit values. */
+  __m128i rgba8 = _mm_cvtsi32_si128(packed);
+  /* Spread to 16 bit values. */
+  __m128i rgba16 = _mm_unpacklo_epi8(rgba8, _mm_setzero_si128());
+  /* Spread to 32 bit values, now each SSE lane has the RGBA value. */
+  __m128i rgba32 = _mm_unpacklo_epi16(rgba16, _mm_setzero_si128());
+
+  /* Premultiply. */
+  __m128 inv_255 = _mm_set1_ps(1.0f / 255.0f);
+  __m128 col = _mm_cvtepi32_ps(rgba32);
+  __m128 alpha = _mm_mul_ps(_mm_shuffle_ps(col, col, _MM_SHUFFLE(3, 3, 3, 3)), inv_255);
+  __m128 fac = _mm_mul_ps(alpha, inv_255);
+  __m128 premul = _mm_mul_ps(col, fac);
+
+  /* Select RGB from the premultiplied color, and alpha as is.
+   * With SSE4 this could use _mm_blendv_ps. */
+  __m128 mask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
+  __m128 res = _mm_or_ps(_mm_and_ps(mask, alpha), _mm_andnot_ps(mask, premul));
+  return res;
+}
+
+static void premul_float_to_straight_uchar_simd(unsigned char *result, __m128 color)
+{
+  __m128 alpha = _mm_shuffle_ps(color, color, _MM_SHUFFLE(3, 3, 3, 3));
+  __m128 one = _mm_set1_ps(1.0f);
+  __m128 is_one = _mm_cmpeq_ps(alpha, one);
+  __m128 is_zero = _mm_cmpeq_ps(alpha, _mm_setzero_ps());
+
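+  /* Un-premultiply (divide by alpha). A zero alpha yields inf/NaN lanes
+   * here; the masks below replace those lanes with the original values. */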
+  __m128 straight = _mm_div_ps(color, alpha);
+
+  __m128 mask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
+  mask = _mm_or_ps(mask, _mm_or_ps(is_zero, is_one));
+  /* With SSE4 this could use _mm_blendv_ps. */
+  color = _mm_or_ps(_mm_and_ps(mask, color), _mm_andnot_ps(mask, straight));
+
+  /* Convert to 0..255. */
+  color = _mm_max_ps(color, _mm_setzero_ps());
+  color = _mm_min_ps(color, one);
+  color = _mm_mul_ps(color, _mm_set1_ps(255.0f));
+  color = _mm_add_ps(color, _mm_set1_ps(0.5f));
+
+  /* Pack and write to the destination: pack to 16 bit signed, then to 8 bit
+   * unsigned, then write the resulting 32-bit value. */
+  __m128i rgba32 = _mm_cvttps_epi32(color);
+  __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
+  __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
+  _mm_store_ss((float *)result, _mm_castsi128_ps(rgba8));
+}
+
+#endif /* BLI_HAVE_SSE2 */
+
+/* Blend a pixel: cp1 over cp2, using factor combined with alpha from cp1. */
+static void blend_pixel_alphaover_byte(const uchar *cp1, const uchar *cp2, uchar *out, float fac)
+{
+#if !BLI_HAVE_SSE2
+  /* Scalar implementation as a fallback and reference. */
+  float col[4], rt1[4], rt2[4];
+  straight_uchar_to_premul_float(rt1, cp1);
+
+  float mfac = 1.0f - fac * rt1[3];
+
+  if (mfac <= 0.0f) {
+    /* Foreground fully covers the background; copy it as is. */
+    *((uint *)out) = *((uint *)cp1);
+  }
+  else {
+    straight_uchar_to_premul_float(rt2, cp2);
+    col[0] = fac * rt1[0] + mfac * rt2[0];
+    col[1] = fac * rt1[1] + mfac * rt2[1];
+    col[2] = fac * rt1[2] + mfac * rt2[2];
+    col[3] = fac * rt1[3] + mfac * rt2[3];
+    premul_float_to_straight_uchar(out, col);
+  }
+
+#else
+  /* Same as above, but with SIMD. */
+  __m128 rt1 = straight_uchar_to_premul_float_simd(cp1);
+
+  __m128 a = _mm_shuffle_ps(rt1, rt1, _MM_SHUFFLE(3, 3, 3, 3));
+  __m128 fac4 = _mm_set1_ps(fac);
+  __m128 mfac = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(fac4, a));
+  /* All lanes of mfac hold the same value; take lane 0 for the test. */
+  float mfac1 = _mm_cvtss_f32(mfac);
+
+  if (mfac1 <= 0.0f) {
+    *((uint *)out) = *((uint *)cp1);
+  }
+  else {
+    __m128 rt2 = straight_uchar_to_premul_float_simd(cp2);
+    __m128 col1 = _mm_mul_ps(fac4, rt1);
+    __m128 col2 = _mm_mul_ps(mfac, rt2);
+    __m128 col = _mm_add_ps(col1, col2);
+    premul_float_to_straight_uchar_simd(out, col);
+  }
+#endif
+}
+
+static void do_alphaover_effect_byte(
+    float fac, int width, int height, const uchar *rect1, const uchar *rect2, uchar *out)
+{
+  if (fac <= 0.0f) {
+    /* Fully transparent foreground: the result is just the background. */
+    memcpy(out, rect2, width * height * 4);
+    return;
+  }
+
+  const uchar *cp1 = rect1;
+  const uchar *cp2 = rect2;
   uchar *rt = out;
-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 over rt2 (alpha from rt1) */
-
-      float tempc[4], rt1[4], rt2[4];
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      float mfac = 1.0f - fac * rt1[3];
-
-      if (fac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp2);
-      }
-      else if (mfac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp1);
-      }
-      else {
-        tempc[0] = fac * rt1[0] + mfac * rt2[0];
-        tempc[1] = fac * rt1[1] + mfac * rt2[1];
-        tempc[2] = fac * rt1[2] + mfac * rt2[2];
-        tempc[3] = fac * rt1[3] + mfac * rt2[3];
-
-        premul_float_to_straight_uchar(rt, tempc);
-      }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      blend_pixel_alphaover_byte(cp1, cp2, rt, fac);
       cp1 += 4;
       cp2 += 4;
       rt += 4;
     }
   }
 }
-- 
2.30.2
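
For reference, the and/andnot/or sequence used in both SIMD helpers above is
the standard SSE2 emulation of a per-lane select; as the in-code comments
note, SSE4.1 collapses it into a single _mm_blendv_ps. A minimal sketch of
the equivalence (the helper names are illustrative, not part of the patch):

  #include <smmintrin.h> /* SSE4.1; emmintrin.h (SSE2) suffices for the first helper. */

  /* Per-lane select: lane i of the result is a[i] where mask lane i is all
   * ones, else b[i]. This is the SSE2 idiom used in the patch. */
  static inline __m128 select_ps_sse2(__m128 mask, __m128 a, __m128 b)
  {
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
  }

  /* SSE4.1 equivalent. Note the argument order: _mm_blendv_ps takes its
   * second argument where the sign bit of the mask lane is set. */
  static inline __m128 select_ps_sse41(__m128 mask, __m128 a, __m128 b)
  {
    return _mm_blendv_ps(b, a, mask);
  }

Since the masks involved are comparison results or _mm_set_epi32(~0, ...)
constants, every lane is either all zeros or all ones, and the two versions
produce identical results.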