1 changed file with 114 additions and 28 deletions
--- a/source/blender/sequencer/intern/effects.cc
+++ b/source/blender/sequencer/intern/effects.cc
@ -21,6 +21,7 @@
 #include "BLI_math_vector_types.hh"
 #include "BLI_path_util.h"
 #include "BLI_rect.h"
+#include "BLI_simd.h"
 #include "BLI_string.h"
 #include "BLI_task.hh"
 #include "BLI_threads.h"
@ -203,37 +204,122 @@ static void init_alpha_over_or_under(Sequence *seq)
  seq->seq1 = seq2;
 }

-static void do_alphaover_effect_byte(
-    float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
+#if BLI_HAVE_SSE2
+/* Load straight-alpha RGBA bytes; return premultiplied RGB and alpha as floats in 0..1. */
+static __m128 straight_uchar_to_premul_float_simd(const unsigned char color[4])
 {
-  uchar *cp1 = rect1;
-  uchar *cp2 = rect2;
+  int packed;
+  memcpy(&packed, color, 4); /* Copy the four bytes rather than casting the pointer. */
+  /* Packed 8 bit values in the low 32 bits of the register. */
+  __m128i rgba8 = _mm_cvtsi32_si128(packed);
+  /* Spread to 16 bit values by interleaving with zero bytes. */
+  __m128i rgba16 = _mm_unpacklo_epi8(rgba8, _mm_setzero_si128());
+  /* Spread to 32 bit values, now each SSE lane holds one channel: color[i] in lane i. */
+  __m128i rgba32 = _mm_unpacklo_epi16(rgba16, _mm_setzero_si128());
+
+  /* Premultiply: alpha is A / 255 in every lane, so premul is channel * A / 255^2. */
+  __m128 inv_255 = _mm_set1_ps(1.0f / 255.0f);
+  __m128 col = _mm_cvtepi32_ps(rgba32);
+  __m128 alpha = _mm_mul_ps(_mm_shuffle_ps(col, col, _MM_SHUFFLE(3, 3, 3, 3)), inv_255);
+  __m128 fac = _mm_mul_ps(alpha, inv_255);
+  __m128 premul = _mm_mul_ps(col, fac);
+
+  /* Select RGB from premultiplied color, and alpha as is (the mask covers lane 3 only).
+   * With SSE4 this could use _mm_blendv_ps. */
+  __m128 mask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
+  __m128 res = _mm_or_ps(_mm_and_ps(mask, alpha), _mm_andnot_ps(mask, premul));
+  return res;
+}
+/* Store a premultiplied float RGBA color as four straight-alpha bytes at `result`. */
+static void premul_float_to_straight_uchar_simd(unsigned char *result, __m128 color)
+{
+  __m128 alpha = _mm_shuffle_ps(color, color, _MM_SHUFFLE(3, 3, 3, 3));
+  __m128 one = _mm_set1_ps(1.0f);
+  __m128 is_one = _mm_cmpeq_ps(alpha, one);
+  __m128 is_zero = _mm_cmpeq_ps(alpha, _mm_setzero_ps());
+
+  __m128 straight = _mm_div_ps(color, alpha); /* Discarded below when alpha is 0 or 1. */
+
+  __m128 mask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0)); /* Lane 3: keep alpha as is. */
+  mask = _mm_or_ps(mask, _mm_or_ps(is_zero, is_one)); /* Every lane if alpha is 0 or 1. */
+  /* Keep masked lanes, un-premultiply the rest. With SSE4 this could use _mm_blendv_ps. */
+  color = _mm_or_ps(_mm_and_ps(mask, color), _mm_andnot_ps(mask, straight));
+
+  /* Clamp to 0..1, scale to 0..255, and add 0.5 so the truncation below rounds to nearest. */
+  color = _mm_max_ps(color, _mm_setzero_ps());
+  color = _mm_min_ps(color, one);
+  color = _mm_mul_ps(color, _mm_set1_ps(255.0f));
+  color = _mm_add_ps(color, _mm_set1_ps(0.5f));
+
+  /* Pack and write to destination: pack to 16 bit signed, then to 8 bit
+   * unsigned, then write resulting 32-bit value. */
+  __m128i rgba32 = _mm_cvttps_epi32(color);
+  __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
+  __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
+  _mm_store_ss((float *)result, _mm_castsi128_ps(rgba8)); /* Writes only the low 4 bytes. */
+}
+
+#endif /* BLI_HAVE_SSE2 */
+
+/* Blend one 4-byte pixel into `out`: cp1 over cp2, with `fac` scaling the weight of cp1. */
+static void blend_pixel_alphaover_byte(const uchar *cp1, const uchar *cp2, uchar *out, float fac)
+{
+#if !BLI_HAVE_SSE2
+  /* Scalar implementation as a fallback and reference. */
+  float col[4], rt1[4], rt2[4];
+  straight_uchar_to_premul_float(rt1, cp1);
+
+  float mfac = 1.0f - fac * rt1[3]; /* Weight that remains for cp2. */
+
+  if (mfac <= 0.0f) {
+    memcpy(out, cp1, 4); /* cp1 fully covers cp2: copy its bytes unchanged. */
+  }
+  else {
+    straight_uchar_to_premul_float(rt2, cp2); /* Only converted when it is blended. */
+    col[0] = fac * rt1[0] + mfac * rt2[0];
+    col[1] = fac * rt1[1] + mfac * rt2[1];
+    col[2] = fac * rt1[2] + mfac * rt2[2];
+    col[3] = fac * rt1[3] + mfac * rt2[3];
+    premul_float_to_straight_uchar(out, col);
+  }
+
+#else
+  /* Same as above, but with SIMD. */
+  __m128 rt1 = straight_uchar_to_premul_float_simd(cp1);
+
+  __m128 a = _mm_shuffle_ps(rt1, rt1, _MM_SHUFFLE(3, 3, 3, 3)); /* cp1 alpha in all lanes. */
+  __m128 fac4 = _mm_set1_ps(fac);
+  __m128 mfac = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(fac4, a));
+  float mfac1 = _mm_cvtss_f32(mfac); /* All lanes are equal; read lane 0. */
+
+  if (mfac1 <= 0.0f) {
+    memcpy(out, cp1, 4); /* memcpy avoids casting away const and type-punning the bytes. */
+  }
+  else {
+    __m128 rt2 = straight_uchar_to_premul_float_simd(cp2);
+    __m128 col1 = _mm_mul_ps(fac4, rt1);
+    __m128 col2 = _mm_mul_ps(mfac, rt2);
+    __m128 col = _mm_add_ps(col1, col2);
+    premul_float_to_straight_uchar_simd(out, col);
+  }
+#endif
+}
+
+static void do_alphaover_effect_byte(
+    float fac, int width, int height, const uchar *rect1, const uchar *rect2, uchar *out)
+{
+  if (fac <= 0.0f) {
+    memcpy(out, rect2, width * height * 4);
+    return;
+  }
+
+  const uchar *cp1 = rect1;
+  const uchar *cp2 = rect2;
  uchar *rt = out;

-  for (int i = 0; i < y; i++) {
-    for (int j = 0; j < x; j++) {
-      /* rt = rt1 over rt2  (alpha from rt1) */
-
-      float tempc[4], rt1[4], rt2[4];
-      straight_uchar_to_premul_float(rt1, cp1);
-      straight_uchar_to_premul_float(rt2, cp2);
-
-      float mfac = 1.0f - fac * rt1[3];
-
-      if (fac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp2);
-      }
-      else if (mfac <= 0.0f) {
-        *((uint *)rt) = *((uint *)cp1);
-      }
-      else {
-        tempc[0] = fac * rt1[0] + mfac * rt2[0];
-        tempc[1] = fac * rt1[1] + mfac * rt2[1];
-        tempc[2] = fac * rt1[2] + mfac * rt2[2];
-        tempc[3] = fac * rt1[3] + mfac * rt2[3];
-
-        premul_float_to_straight_uchar(rt, tempc);
-      }
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x < width; x++) {
+      blend_pixel_alphaover_byte(cp1, cp2, rt, fac);
      cp1 += 4;
      cp2 += 4;
      rt += 4;