WIP: VSE: speedup Alpha Over blend with SIMD #115892

Aras Pranckevicius wants to merge 1 commit from aras_p/blender:vse-alphaover-opt into main

1 changed file with 114 additions and 28 deletions

@ -21,6 +21,7 @@
#include "BLI_math_vector_types.hh"
#include "BLI_path_util.h"
#include "BLI_rect.h"
#include "BLI_simd.h"
#include "BLI_string.h"
#include "BLI_task.hh"
#include "BLI_threads.h"
@ -203,37 +204,122 @@ static void init_alpha_over_or_under(Sequence *seq)
seq->seq1 = seq2;
static void do_alphaover_effect_byte(
float fac, int x, int y, uchar *rect1, uchar *rect2, uchar *out)
/* Convert one 4-byte color to four float lanes using SSE2.
 *
 * The bytes are widened to 32-bit integers and converted to float. Lanes 0..2 are
 * returned as `byte / 255 * color[3] / 255`; lane 3 is returned as `color[3] / 255`.
 * `color` must point to at least 4 readable bytes (4 bytes are copied from it). */
static __m128 straight_uchar_to_premul_float_simd(const unsigned char color[4])
/* NOTE(review): the two declarations below read `rect1` / `rect2`, which are not
 * parameters of this function. They look like removed-side lines of the diff that
 * got interleaved here -- confirm against the actual file. */
uchar *cp1 = rect1;
uchar *cp2 = rect2;
/* Copy the 4 bytes into an int with memcpy rather than dereferencing a cast pointer. */
int packed;
memcpy(&packed, color, 4);
/* Packed 8 bit values, in the low 32 bits of the register. */
__m128i rgba8 = _mm_cvtsi32_si128(packed);
/* Spread to 16 bit values by interleaving with zero bytes. */
__m128i rgba16 = _mm_unpacklo_epi8(rgba8, _mm_setzero_si128());
/* Spread to 32 bit values; each 32-bit lane now holds one of the four input bytes. */
__m128i rgba32 = _mm_unpacklo_epi16(rgba16, _mm_setzero_si128());
/* Premultiply. */
__m128 inv_255 = _mm_set1_ps(1.0f / 255.0f);
__m128 col = _mm_cvtepi32_ps(rgba32);
/* Broadcast lane 3 to all lanes and scale it by 1/255. */
__m128 alpha = _mm_mul_ps(_mm_shuffle_ps(col, col, _MM_SHUFFLE(3, 3, 3, 3)), inv_255);
/* Fold the second 1/255 (for the still unscaled 0..255 color lanes) into the factor,
 * so one multiply both normalizes and premultiplies. */
__m128 fac = _mm_mul_ps(alpha, inv_255);
__m128 premul = _mm_mul_ps(col, fac);
/* Select lanes 0..2 from the premultiplied color, and lane 3 from `alpha` as is.
 * The mask has all bits set in lane 3 only (first argument of _mm_set_epi32 is the
 * highest lane). With SSE4 this could use _mm_blendv_ps. */
__m128 mask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
__m128 res = _mm_or_ps(_mm_and_ps(mask, alpha), _mm_andnot_ps(mask, premul));
return res;
/* Convert four float lanes back to a 4-byte color and write it to `result`, using SSE2.
 *
 * Lanes 0..2 are divided by lane 3, except when lane 3 is exactly 0 or exactly 1, in
 * which case they are used unchanged. Lane 3 itself is never divided. All lanes are then
 * clamped to 0..1, scaled to 0..255 and rounded by adding 0.5 before truncation.
 * Exactly 4 bytes are written to `result`. */
static void premul_float_to_straight_uchar_simd(unsigned char *result, __m128 color)
/* Lane 3 broadcast to all lanes, so the comparisons below give the same answer in
 * every lane. */
__m128 alpha = _mm_shuffle_ps(color, color, _MM_SHUFFLE(3, 3, 3, 3));
__m128 one = _mm_set1_ps(1.0f);
__m128 is_one = _mm_cmpeq_ps(alpha, one);
__m128 is_zero = _mm_cmpeq_ps(alpha, _mm_setzero_ps());
/* Computed unconditionally; when alpha is 0 this produces inf/NaN lanes, which the mask
 * below discards because `is_zero` is then set in all lanes. */
__m128 straight = _mm_div_ps(color, alpha);
/* Mask of lanes that keep the input value: always lane 3, and every lane when alpha
 * is 0 or 1. */
__m128 mask = _mm_castsi128_ps(_mm_set_epi32(~0, 0, 0, 0));
mask = _mm_or_ps(mask, _mm_or_ps(is_zero, is_one));
/* With SSE4 this could use _mm_blendv_ps. */
color = _mm_or_ps(_mm_and_ps(mask, color), _mm_andnot_ps(mask, straight));
/* Convert to 0..255: clamp to 0..1, scale, and add 0.5 so that the truncating
 * conversion below rounds to nearest. */
color = _mm_max_ps(color, _mm_setzero_ps());
color = _mm_min_ps(color, one);
color = _mm_mul_ps(color, _mm_set1_ps(255.0f));
color = _mm_add_ps(color, _mm_set1_ps(0.5f));
/* Pack and write to destination: pack to 16 bit signed, then to 8 bit
 * unsigned, then write resulting 32-bit value. */
__m128i rgba32 = _mm_cvttps_epi32(color);
__m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
__m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
/* NOTE(review): this stores the low 32 bits through a `float *` obtained by casting the
 * byte pointer. The load side uses memcpy for the equivalent step; consider
 * _mm_cvtsi128_si32 + memcpy here as well to avoid the pointer cast -- confirm. */
_mm_store_ss((float *)result, _mm_castsi128_ps(rgba8));
#endif /* BLI_HAVE_SSE2 */
/* Blend a pixel: cp1 over cp2, using factor combined with alpha from cp1.
 *
 * `cp1`, `cp2` and `out` each address one 4-byte pixel. With `mfac = 1 - fac * alpha1`
 * (alpha1 being component 3 of the converted cp1 color): when `mfac <= 0` the 4 bytes of
 * cp1 are copied to `out` unchanged; otherwise `out` receives the conversion back to
 * bytes of `fac * rt1 + mfac * rt2`.
 *
 * Two bodies are listed below, a scalar one and an SSE2 one. The preprocessor lines that
 * choose between them are not visible in this listing. */
static void blend_pixel_alphaover_byte(const uchar *cp1, const uchar *cp2, uchar *out, float fac)
/* Scalar implementation as a fallback and reference. */
float col[4], rt1[4], rt2[4];
straight_uchar_to_premul_float(rt1, cp1);
float mfac = 1.0f - fac * rt1[3];
if (mfac <= 0.0f) {
/* cp2 contributes nothing: copy the cp1 pixel as one 32-bit value.
 * NOTE(review): the cast drops `const` from cp1 and reads/writes byte data through
 * `uint *`; memcpy(out, cp1, 4) would avoid both -- confirm. */
*((uint *)out) = *((uint *)cp1);
else {
/* cp2 is only converted on this path, where it is actually needed. */
straight_uchar_to_premul_float(rt2, cp2);
col[0] = fac * rt1[0] + mfac * rt2[0];
col[1] = fac * rt1[1] + mfac * rt2[1];
col[2] = fac * rt1[2] + mfac * rt2[2];
col[3] = fac * rt1[3] + mfac * rt2[3];
premul_float_to_straight_uchar(out, col);
/* Same as above, but with SIMD. */
__m128 rt1 = straight_uchar_to_premul_float_simd(cp1);
/* Lane 3 of rt1 broadcast to all lanes. */
__m128 a = _mm_shuffle_ps(rt1, rt1, _MM_SHUFFLE(3, 3, 3, 3));
__m128 fac4 = _mm_set1_ps(fac);
__m128 mfac = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(fac4, a));
/* All lanes of mfac are equal (both inputs are broadcasts), so lane 0 is enough for
 * the scalar test. */
float mfac1 = _mm_cvtss_f32(mfac);
if (mfac1 <= 0.0f) {
/* NOTE(review): same `(uint *)` cast of a `const uchar *` as in the scalar body. */
*((uint *)out) = *((uint *)cp1);
else {
__m128 rt2 = straight_uchar_to_premul_float_simd(cp2);
/* col = fac * rt1 + mfac * rt2, on all four lanes at once. */
__m128 col1 = _mm_mul_ps(fac4, rt1);
__m128 col2 = _mm_mul_ps(mfac, rt2);
__m128 col = _mm_add_ps(col1, col2);
premul_float_to_straight_uchar_simd(out, col);
/* Alpha-over blend of two byte images: for each of `width * height` pixels, blend the
 * pixel of `rect1` over the pixel of `rect2` with factor `fac` and write it to `out`.
 * All three buffers are walked 4 bytes per pixel.
 *
 * NOTE(review): this listing interleaves two versions of the loop. The first one uses `x`
 * and `y` as bounds, which are not parameters of this signature, so it is presumably the
 * removed side of the diff; the second one (using `width` / `height`) matches this
 * signature -- confirm against the actual file. */
static void do_alphaover_effect_byte(
float fac, int width, int height, const uchar *rect1, const uchar *rect2, uchar *out)
/* With a non-positive factor the output is a plain copy of rect2.
 * NOTE(review): no `return` is visible after the memcpy in this listing; verify the
 * early-out actually skips the loop. The byte count is also computed in `int`. */
if (fac <= 0.0f) {
memcpy(out, rect2, width * height * 4);
const uchar *cp1 = rect1;
const uchar *cp2 = rect2;
uchar *rt = out;
/* Per-pixel loop bounded by `y` / `x` (see the note at the top of this function). */
for (int i = 0; i < y; i++) {
for (int j = 0; j < x; j++) {
/* rt = rt1 over rt2 (alpha from rt1) */
float tempc[4], rt1[4], rt2[4];
straight_uchar_to_premul_float(rt1, cp1);
straight_uchar_to_premul_float(rt2, cp2);
float mfac = 1.0f - fac * rt1[3];
/* This version tests `fac` per pixel and converts both inputs before the test. */
if (fac <= 0.0f) {
*((uint *)rt) = *((uint *)cp2);
else if (mfac <= 0.0f) {
*((uint *)rt) = *((uint *)cp1);
else {
tempc[0] = fac * rt1[0] + mfac * rt2[0];
tempc[1] = fac * rt1[1] + mfac * rt2[1];
tempc[2] = fac * rt1[2] + mfac * rt2[2];
tempc[3] = fac * rt1[3] + mfac * rt2[3];
premul_float_to_straight_uchar(rt, tempc);
/* Per-pixel loop bounded by `height` / `width`: the blend is delegated to
 * blend_pixel_alphaover_byte and all three pointers advance by one 4-byte pixel. */
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
blend_pixel_alphaover_byte(cp1, cp2, rt, fac);
cp1 += 4;
cp2 += 4;
rt += 4;