From 692a8a6816f047a2393b12e32634b02abde3fe09 Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Tue, 5 Dec 2023 21:44:49 +0200 Subject: [PATCH 1/5] VSE: optimize Glow effect with multi-threading Applying glow at 4K UHD resolution, on Windows Ryzen 5950X: - distance 4: 935ms -> 136ms - distance 20: 3524ms -> 365ms --- source/blender/sequencer/intern/effects.cc | 319 ++++++++++----------- 1 file changed, 149 insertions(+), 170 deletions(-) diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc index 9fa721f4e2b..16003f850ce 100644 --- a/source/blender/sequencer/intern/effects.cc +++ b/source/blender/sequencer/intern/effects.cc @@ -14,11 +14,14 @@ #include "MEM_guardedalloc.h" +#include "BLI_array.hh" #include "BLI_listbase.h" #include "BLI_math_rotation.h" +#include "BLI_math_vector_types.hh" #include "BLI_path_util.h" #include "BLI_rect.h" #include "BLI_string.h" +#include "BLI_task.hh" #include "BLI_threads.h" #include "BLI_utildefines.h" @@ -60,6 +63,8 @@ #include "strip_time.hh" #include "utils.hh" +#include "BLI_timeit.hh" + static SeqEffectHandle get_sequence_effect_impl(int seq_type); /* -------------------------------------------------------------------- */ @@ -122,13 +127,6 @@ static void slice_get_float_buffers(const SeqRenderData *context, /** \name Glow Effect * \{ */ -enum { - GlowR = 0, - GlowG = 1, - GlowB = 2, - GlowA = 3, -}; - static ImBuf *prepare_effect_imbufs(const SeqRenderData *context, ImBuf *ibuf1, ImBuf *ibuf2, @@ -1960,20 +1958,9 @@ static void do_transform_effect(const SeqRenderData *context, /** \name Glow Effect * \{ */ -static void RVBlurBitmap2_float(float *map, int width, int height, float blur, int quality) +static void glow_blur_bitmap(blender::float4 *map, int width, int height, float blur, int quality) { - /* Much better than the previous blur! - * We do the blurring in two passes which is a whole lot faster. - * I changed the math around to implement an actual Gaussian distribution. - * - * Watch out though, it tends to misbehave with large blur values on - * a small bitmap. Avoid! */ - - float *temp = nullptr, *swap; - float *filter = nullptr; - int x, y, i, fx, fy; - int index, ix, halfWidth; - float fval, k, curColor[4], curColor2[4], weight = 0; + using namespace blender; /* If we're not really blurring, bail out */ if (blur <= 0) { @@ -1981,183 +1968,141 @@ static void RVBlurBitmap2_float(float *map, int width, int height, float blur, i } /* If result would be no blurring, early out. */ - halfWidth = ((quality + 1) * blur); + const int halfWidth = ((quality + 1) * blur); if (halfWidth == 0) { return; } - /* Allocate memory for the temp-map and the blur filter matrix. */ - temp = static_cast(MEM_mallocN(sizeof(float[4]) * width * height, "blurbitmaptemp")); - if (!temp) { - return; - } + Array temp(width * height); - /* Allocate memory for the filter elements */ - filter = (float *)MEM_mallocN(sizeof(float) * halfWidth * 2, "blurbitmapfilter"); - if (!filter) { - MEM_freeN(temp); - return; - } - - /* Apparently we're calculating a bell curve based on the standard deviation (or radius) - * This code is based on an example posted to comp.graphics.algorithms by - * Blancmange - */ - - k = -1.0f / (2.0f * float(M_PI) * blur * blur); - - for (ix = 0; ix < halfWidth; ix++) { + /* Initialize the gaussian filter. @TODO: use code from RE_filter_value */ + Array filter(halfWidth * 2); + const float k = -1.0f / (2.0f * float(M_PI) * blur * blur); + float weight = 0; + for (int ix = 0; ix < halfWidth; ix++) { weight = float(exp(k * (ix * ix))); filter[halfWidth - ix] = weight; filter[halfWidth + ix] = weight; } filter[0] = weight; - /* Normalize the array */ - fval = 0; - for (ix = 0; ix < halfWidth * 2; ix++) { + float fval = 0; + for (int ix = 0; ix < halfWidth * 2; ix++) { fval += filter[ix]; } - - for (ix = 0; ix < halfWidth * 2; ix++) { + for (int ix = 0; ix < halfWidth * 2; ix++) { filter[ix] /= fval; } /* Blur the rows */ - for (y = 0; y < height; y++) { - /* Do the left & right strips */ - for (x = 0; x < halfWidth; x++) { - fx = 0; - zero_v4(curColor); - zero_v4(curColor2); + threading::parallel_for(IndexRange(height), 32, [&](const IndexRange y_range) { + for (const int y : y_range) { + /* Do the left & right strips */ + for (int x = 0; x < halfWidth; x++) { + float4 curColor = float4(0.0f); + float4 curColor2 = float4(0.0f); - for (i = x - halfWidth; i < x + halfWidth; i++) { - if ((i >= 0) && (i < width)) { - index = (i + y * width) * 4; - madd_v4_v4fl(curColor, map + index, filter[fx]); - - index = (width - 1 - i + y * width) * 4; - madd_v4_v4fl(curColor2, map + index, filter[fx]); + int fx = 0; + for (int i = x - halfWidth; i < x + halfWidth; i++) { + if ((i >= 0) && (i < width)) { + curColor += map[i + y * width] * filter[fx]; + curColor2 += map[width - 1 - i + y * width] * filter[fx]; + } + fx++; } - fx++; + temp[x + y * width] = curColor; + temp[width - 1 - x + y * width] = curColor2; } - index = (x + y * width) * 4; - copy_v4_v4(temp + index, curColor); - index = (width - 1 - x + y * width) * 4; - copy_v4_v4(temp + index, curColor2); - } - - /* Do the main body */ - for (x = halfWidth; x < width - halfWidth; x++) { - fx = 0; - zero_v4(curColor); - for (i = x - halfWidth; i < x + halfWidth; i++) { - index = (i + y * width) * 4; - madd_v4_v4fl(curColor, map + index, filter[fx]); - fx++; + /* Do the main body */ + for (int x = halfWidth; x < width - halfWidth; x++) { + int fx = 0; + float4 curColor = float4(0.0f); + for (int i = x - halfWidth; i < x + halfWidth; i++) { + curColor += map[i + y * width] * filter[fx]; + fx++; + } + temp[x + y * width] = curColor; } - index = (x + y * width) * 4; - copy_v4_v4(temp + index, curColor); } - } - - /* Swap buffers */ - swap = temp; - temp = map; - map = swap; + }); /* Blur the columns */ - for (x = 0; x < width; x++) { - /* Do the top & bottom strips */ - for (y = 0; y < halfWidth; y++) { - fy = 0; - zero_v4(curColor); - zero_v4(curColor2); - for (i = y - halfWidth; i < y + halfWidth; i++) { - if ((i >= 0) && (i < height)) { - /* Bottom */ - index = (x + i * width) * 4; - madd_v4_v4fl(curColor, map + index, filter[fy]); - - /* Top */ - index = (x + (height - 1 - i) * width) * 4; - madd_v4_v4fl(curColor2, map + index, filter[fy]); + threading::parallel_for(IndexRange(width), 32, [&](const IndexRange x_range) { + for (const int x : x_range) { + /* Do the top & bottom strips */ + for (int y = 0; y < halfWidth; y++) { + float4 curColor = float4(0.0f); + float4 curColor2 = float4(0.0f); + int fy = 0; + for (int i = y - halfWidth; i < y + halfWidth; i++) { + if ((i >= 0) && (i < height)) { + curColor += temp[x + i * width] * filter[fy]; + curColor2 += temp[x + (height - 1 - i) * width] * filter[fy]; + } + fy++; } - fy++; + + map[x + y * width] = curColor; + map[x + (height - 1 - y) * width] = curColor2; } - index = (x + y * width) * 4; - copy_v4_v4(temp + index, curColor); - index = (x + (height - 1 - y) * width) * 4; - copy_v4_v4(temp + index, curColor2); - } - - /* Do the main body */ - for (y = halfWidth; y < height - halfWidth; y++) { - fy = 0; - zero_v4(curColor); - for (i = y - halfWidth; i < y + halfWidth; i++) { - index = (x + i * width) * 4; - madd_v4_v4fl(curColor, map + index, filter[fy]); - fy++; + /* Do the main body */ + for (int y = halfWidth; y < height - halfWidth; y++) { + float4 curColor = float4(0.0f); + int fy = 0; + for (int i = y - halfWidth; i < y + halfWidth; i++) { + curColor += temp[x + i * width] * filter[fy]; + fy++; + } + map[x + y * width] = curColor; } - index = (x + y * width) * 4; - copy_v4_v4(temp + index, curColor); } - } - - /* Swap buffers */ - swap = temp; - temp = map; - // map = swap; /* UNUSED. */ - - /* Tidy up. */ - MEM_freeN(filter); - MEM_freeN(temp); + }); } -static void RVAddBitmaps_float(float *a, float *b, float *c, int width, int height) +static void blur_add_bitmap(const float *src, float *dst, int width, int height) { - int x, y, index; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - index = (x + y * width) * 4; - c[index + GlowR] = min_ff(1.0f, a[index + GlowR] + b[index + GlowR]); - c[index + GlowG] = min_ff(1.0f, a[index + GlowG] + b[index + GlowG]); - c[index + GlowB] = min_ff(1.0f, a[index + GlowB] + b[index + GlowB]); - c[index + GlowA] = min_ff(1.0f, a[index + GlowA] + b[index + GlowA]); + using namespace blender; + threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) { + for (const int y : y_range) { + int index = y * width * 4; + for (int x = 0; x < width; x++, index += 4) { + dst[index + 0] = min_ff(1.0f, src[index + 0] + dst[index + 0]); + dst[index + 1] = min_ff(1.0f, src[index + 1] + dst[index + 1]); + dst[index + 2] = min_ff(1.0f, src[index + 2] + dst[index + 2]); + dst[index + 3] = min_ff(1.0f, src[index + 3] + dst[index + 3]); + } } - } + }); } -static void RVIsolateHighlights_float( +static void blur_isolate_highlights( const float *in, float *out, int width, int height, float threshold, float boost, float clamp) { - int x, y, index; - float intensity; + using namespace blender; + threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) { + for (const int y : y_range) { + int index = y * width * 4; + for (int x = 0; x < width; x++, index += 4) { - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - index = (x + y * width) * 4; - - /* Isolate the intensity */ - intensity = (in[index + GlowR] + in[index + GlowG] + in[index + GlowB] - threshold); - if (intensity > 0) { - out[index + GlowR] = min_ff(clamp, (in[index + GlowR] * boost * intensity)); - out[index + GlowG] = min_ff(clamp, (in[index + GlowG] * boost * intensity)); - out[index + GlowB] = min_ff(clamp, (in[index + GlowB] * boost * intensity)); - out[index + GlowA] = min_ff(clamp, (in[index + GlowA] * boost * intensity)); - } - else { - out[index + GlowR] = 0; - out[index + GlowG] = 0; - out[index + GlowB] = 0; - out[index + GlowA] = 0; + /* Isolate the intensity */ + float intensity = (in[index + 0] + in[index + 1] + in[index + 2] - threshold); + if (intensity > 0) { + out[index + 0] = min_ff(clamp, (in[index + 0] * boost * intensity)); + out[index + 1] = min_ff(clamp, (in[index + 1] * boost * intensity)); + out[index + 2] = min_ff(clamp, (in[index + 2] * boost * intensity)); + out[index + 3] = min_ff(clamp, (in[index + 3] * boost * intensity)); + } + else { + out[index + 0] = 0; + out[index + 1] = 0; + out[index + 2] = 0; + out[index + 3] = 0; + } } } - } + }); } static void init_glow_effect(Sequence *seq) @@ -2203,25 +2148,54 @@ static void do_glow_effect_byte(Sequence *seq, uchar * /*rect2*/, uchar *out) { - float *outbuf, *inbuf; + SCOPED_TIMER(__func__); GlowVars *glow = (GlowVars *)seq->effectdata; - inbuf = static_cast(MEM_mallocN(sizeof(float[4]) * x * y, "glow effect input")); - outbuf = static_cast(MEM_mallocN(sizeof(float[4]) * x * y, "glow effect output")); + float *inbuf = static_cast(MEM_mallocN(sizeof(float[4]) * x * y, "glow effect input")); + float *outbuf = static_cast( + MEM_mallocN(sizeof(float[4]) * x * y, "glow effect output")); - IMB_buffer_float_from_byte(inbuf, rect1, IB_PROFILE_SRGB, IB_PROFILE_SRGB, false, x, y, x, x); - IMB_buffer_float_premultiply(inbuf, x, y); + using namespace blender; + threading::parallel_for(IndexRange(y), 64, [&](const IndexRange y_range) { + size_t offset = y_range.first() * x * 4; + IMB_buffer_float_from_byte(inbuf + offset, + rect1 + offset, + IB_PROFILE_SRGB, + IB_PROFILE_SRGB, + false, + x, + y_range.size(), + x, + x); + IMB_buffer_float_premultiply(inbuf + offset, x, y_range.size()); + }); - RVIsolateHighlights_float( + blur_isolate_highlights( inbuf, outbuf, x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); - RVBlurBitmap2_float(outbuf, x, y, glow->dDist * (render_size / 100.0f), glow->dQuality); + glow_blur_bitmap(reinterpret_cast(outbuf), + x, + y, + glow->dDist * (render_size / 100.0f), + glow->dQuality); if (!glow->bNoComp) { - RVAddBitmaps_float(inbuf, outbuf, outbuf, x, y); + blur_add_bitmap(inbuf, outbuf, x, y); } - IMB_buffer_float_unpremultiply(outbuf, x, y); - IMB_buffer_byte_from_float( - out, outbuf, 4, 0.0f, IB_PROFILE_SRGB, IB_PROFILE_SRGB, false, x, y, x, x); + threading::parallel_for(IndexRange(y), 64, [&](const IndexRange y_range) { + size_t offset = y_range.first() * x * 4; + IMB_buffer_float_unpremultiply(outbuf + offset, x, y_range.size()); + IMB_buffer_byte_from_float(out + offset, + outbuf + offset, + 4, + 0.0f, + IB_PROFILE_SRGB, + IB_PROFILE_SRGB, + false, + x, + y_range.size(), + x, + x); + }); MEM_freeN(inbuf); MEM_freeN(outbuf); @@ -2236,15 +2210,20 @@ static void do_glow_effect_float(Sequence *seq, float * /*rect2*/, float *out) { + SCOPED_TIMER(__func__); float *outbuf = out; float *inbuf = rect1; GlowVars *glow = (GlowVars *)seq->effectdata; - RVIsolateHighlights_float( + blur_isolate_highlights( inbuf, outbuf, x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); - RVBlurBitmap2_float(outbuf, x, y, glow->dDist * (render_size / 100.0f), glow->dQuality); + glow_blur_bitmap(reinterpret_cast(outbuf), + x, + y, + glow->dDist * (render_size / 100.0f), + glow->dQuality); if (!glow->bNoComp) { - RVAddBitmaps_float(inbuf, outbuf, outbuf, x, y); + blur_add_bitmap(inbuf, outbuf, x, y); } } -- 2.30.2 From ebde205d2b8d33584f461a5ecfd48b4bb0031b92 Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Tue, 5 Dec 2023 22:01:45 +0200 Subject: [PATCH 2/5] VSE: further speedup of Glow by merging operations Instead of doing preparation/finishing operations in separate passes over the image, do a combined operation in one go. This also makes IMB_buffer_float_unpremultiply and IMB_buffer_float_premultiply not be used by anything, so remove. Applying glow at 4K UHD resolution, on Windows Ryzen 5950X: - distance 4: 136ms -> 122ms - distance 20: 365ms -> 346ms --- source/blender/imbuf/IMB_imbuf.h | 2 -- source/blender/imbuf/intern/divers.cc | 20 -------------------- source/blender/sequencer/intern/effects.cc | 17 ++--------------- 3 files changed, 2 insertions(+), 37 deletions(-) diff --git a/source/blender/imbuf/IMB_imbuf.h b/source/blender/imbuf/IMB_imbuf.h index fdb9098b339..fa2aac4ea35 100644 --- a/source/blender/imbuf/IMB_imbuf.h +++ b/source/blender/imbuf/IMB_imbuf.h @@ -643,8 +643,6 @@ void IMB_buffer_byte_from_byte(unsigned char *rect_to, int height, int stride_to, int stride_from); -void IMB_buffer_float_unpremultiply(float *buf, int width, int height); -void IMB_buffer_float_premultiply(float *buf, int width, int height); /** * Change the ordering of the color bytes pointed to by rect from diff --git a/source/blender/imbuf/intern/divers.cc b/source/blender/imbuf/intern/divers.cc index 21240912448..d2fcfed4712 100644 --- a/source/blender/imbuf/intern/divers.cc +++ b/source/blender/imbuf/intern/divers.cc @@ -856,26 +856,6 @@ void IMB_color_to_bw(ImBuf *ibuf) } } -void IMB_buffer_float_unpremultiply(float *buf, int width, int height) -{ - size_t total = size_t(width) * height; - float *fp = buf; - while (total--) { - premul_to_straight_v4(fp); - fp += 4; - } -} - -void IMB_buffer_float_premultiply(float *buf, int width, int height) -{ - size_t total = size_t(width) * height; - float *fp = buf; - while (total--) { - straight_to_premul_v4(fp); - fp += 4; - } -} - /** \} */ /* -------------------------------------------------------------------- */ diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc index 16003f850ce..e2eb24f9971 100644 --- a/source/blender/sequencer/intern/effects.cc +++ b/source/blender/sequencer/intern/effects.cc @@ -2156,19 +2156,7 @@ static void do_glow_effect_byte(Sequence *seq, MEM_mallocN(sizeof(float[4]) * x * y, "glow effect output")); using namespace blender; - threading::parallel_for(IndexRange(y), 64, [&](const IndexRange y_range) { - size_t offset = y_range.first() * x * 4; - IMB_buffer_float_from_byte(inbuf + offset, - rect1 + offset, - IB_PROFILE_SRGB, - IB_PROFILE_SRGB, - false, - x, - y_range.size(), - x, - x); - IMB_buffer_float_premultiply(inbuf + offset, x, y_range.size()); - }); + IMB_colormanagement_transform_from_byte_threaded(inbuf, rect1, x, y, 4, "sRGB", "sRGB"); blur_isolate_highlights( inbuf, outbuf, x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); @@ -2183,14 +2171,13 @@ static void do_glow_effect_byte(Sequence *seq, threading::parallel_for(IndexRange(y), 64, [&](const IndexRange y_range) { size_t offset = y_range.first() * x * 4; - IMB_buffer_float_unpremultiply(outbuf + offset, x, y_range.size()); IMB_buffer_byte_from_float(out + offset, outbuf + offset, 4, 0.0f, IB_PROFILE_SRGB, IB_PROFILE_SRGB, - false, + true, x, y_range.size(), x, -- 2.30.2 From e649b52d5951d02ca59b28eea5d9cbadf5ab5710 Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Tue, 5 Dec 2023 22:19:35 +0200 Subject: [PATCH 3/5] VSE: simplify Glow code by using vector types No performance difference observed --- source/blender/sequencer/intern/effects.cc | 76 +++++++++++----------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc index e2eb24f9971..f01a5720d1d 100644 --- a/source/blender/sequencer/intern/effects.cc +++ b/source/blender/sequencer/intern/effects.cc @@ -17,6 +17,7 @@ #include "BLI_array.hh" #include "BLI_listbase.h" #include "BLI_math_rotation.h" +#include "BLI_math_vector.hh" #include "BLI_math_vector_types.hh" #include "BLI_path_util.h" #include "BLI_rect.h" @@ -2061,45 +2062,48 @@ static void glow_blur_bitmap(blender::float4 *map, int width, int height, float }); } -static void blur_add_bitmap(const float *src, float *dst, int width, int height) +static void blur_add_bitmap(const blender::float4 *src, + blender::float4 *dst, + int width, + int height) { using namespace blender; threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) { + const float4 v1 = float4(1.0f); for (const int y : y_range) { - int index = y * width * 4; - for (int x = 0; x < width; x++, index += 4) { - dst[index + 0] = min_ff(1.0f, src[index + 0] + dst[index + 0]); - dst[index + 1] = min_ff(1.0f, src[index + 1] + dst[index + 1]); - dst[index + 2] = min_ff(1.0f, src[index + 2] + dst[index + 2]); - dst[index + 3] = min_ff(1.0f, src[index + 3] + dst[index + 3]); + int index = y * width; + for (int x = 0; x < width; x++, index++) { + dst[index] = math::min(v1, src[index] + dst[index]); } } }); } -static void blur_isolate_highlights( - const float *in, float *out, int width, int height, float threshold, float boost, float clamp) +static void blur_isolate_highlights(const blender::float4 *in, + blender::float4 *out, + int width, + int height, + float threshold, + float boost, + float clamp) { using namespace blender; threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) { + const float4 clampv = float4(clamp); for (const int y : y_range) { - int index = y * width * 4; - for (int x = 0; x < width; x++, index += 4) { + int index = y * width; + for (int x = 0; x < width; x++, index++) { /* Isolate the intensity */ - float intensity = (in[index + 0] + in[index + 1] + in[index + 2] - threshold); + float intensity = (in[index].x + in[index].y + in[index].z - threshold); + float4 val; if (intensity > 0) { - out[index + 0] = min_ff(clamp, (in[index + 0] * boost * intensity)); - out[index + 1] = min_ff(clamp, (in[index + 1] * boost * intensity)); - out[index + 2] = min_ff(clamp, (in[index + 2] * boost * intensity)); - out[index + 3] = min_ff(clamp, (in[index + 3] * boost * intensity)); + val = math::min(clampv, in[index] * (boost * intensity)); } else { - out[index + 0] = 0; - out[index + 1] = 0; - out[index + 2] = 0; - out[index + 3] = 0; + val = float4(0.0f); } + out[index] = val; } } }); @@ -2148,31 +2152,27 @@ static void do_glow_effect_byte(Sequence *seq, uchar * /*rect2*/, uchar *out) { + using namespace blender; SCOPED_TIMER(__func__); GlowVars *glow = (GlowVars *)seq->effectdata; - float *inbuf = static_cast(MEM_mallocN(sizeof(float[4]) * x * y, "glow effect input")); - float *outbuf = static_cast( - MEM_mallocN(sizeof(float[4]) * x * y, "glow effect output")); + Array inbuf(x * y); + Array outbuf(x * y); using namespace blender; - IMB_colormanagement_transform_from_byte_threaded(inbuf, rect1, x, y, 4, "sRGB", "sRGB"); + IMB_colormanagement_transform_from_byte_threaded(*inbuf.data(), rect1, x, y, 4, "sRGB", "sRGB"); blur_isolate_highlights( - inbuf, outbuf, x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); - glow_blur_bitmap(reinterpret_cast(outbuf), - x, - y, - glow->dDist * (render_size / 100.0f), - glow->dQuality); + inbuf.data(), outbuf.data(), x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); + glow_blur_bitmap(outbuf.data(), x, y, glow->dDist * (render_size / 100.0f), glow->dQuality); if (!glow->bNoComp) { - blur_add_bitmap(inbuf, outbuf, x, y); + blur_add_bitmap(inbuf.data(), outbuf.data(), x, y); } threading::parallel_for(IndexRange(y), 64, [&](const IndexRange y_range) { - size_t offset = y_range.first() * x * 4; - IMB_buffer_byte_from_float(out + offset, - outbuf + offset, + size_t offset = y_range.first() * x; + IMB_buffer_byte_from_float(out + offset * 4, + *(outbuf.data() + offset), 4, 0.0f, IB_PROFILE_SRGB, @@ -2183,9 +2183,6 @@ static void do_glow_effect_byte(Sequence *seq, x, x); }); - - MEM_freeN(inbuf); - MEM_freeN(outbuf); } static void do_glow_effect_float(Sequence *seq, @@ -2197,9 +2194,10 @@ static void do_glow_effect_float(Sequence *seq, float * /*rect2*/, float *out) { + using namespace blender; SCOPED_TIMER(__func__); - float *outbuf = out; - float *inbuf = rect1; + float4 *outbuf = reinterpret_cast(out); + float4 *inbuf = reinterpret_cast(rect1); GlowVars *glow = (GlowVars *)seq->effectdata; blur_isolate_highlights( -- 2.30.2 From b695329bb9d7da8372669c037311b6360eacbd2c Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Wed, 6 Dec 2023 09:49:37 +0200 Subject: [PATCH 4/5] VSE: simplify and speedup Glow some more Instead of applying blur kernel to "left + right side, followed by middle", do much simpler thing and just apply it normally, taking care of boundary conditions where kernel would step outside the image. Also instead of doing "add glow to original image" in a separate pass over the whole image, just add source when writing the final pixel. Less code, and faster. Applying glow at 4K UHD resolution, on Windows Ryzen 5950X: - distance 4: 122ms -> 109ms - distance 20: 346ms -> 336ms --- source/blender/sequencer/intern/effects.cc | 102 ++++++--------------- 1 file changed, 29 insertions(+), 73 deletions(-) diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc index f01a5720d1d..ba06101d9b3 100644 --- a/source/blender/sequencer/intern/effects.cc +++ b/source/blender/sequencer/intern/effects.cc @@ -1959,7 +1959,12 @@ static void do_transform_effect(const SeqRenderData *context, /** \name Glow Effect * \{ */ -static void glow_blur_bitmap(blender::float4 *map, int width, int height, float blur, int quality) +static void glow_blur_bitmap(const blender::float4 *src, + blender::float4 *map, + int width, + int height, + float blur, + int quality) { using namespace blender; @@ -1995,66 +2000,34 @@ static void glow_blur_bitmap(blender::float4 *map, int width, int height, float filter[ix] /= fval; } - /* Blur the rows */ + /* Blur the rows: read map, write temp */ threading::parallel_for(IndexRange(height), 32, [&](const IndexRange y_range) { for (const int y : y_range) { - /* Do the left & right strips */ - for (int x = 0; x < halfWidth; x++) { + for (int x = 0; x < width; x++) { float4 curColor = float4(0.0f); - float4 curColor2 = float4(0.0f); - - int fx = 0; - for (int i = x - halfWidth; i < x + halfWidth; i++) { - if ((i >= 0) && (i < width)) { - curColor += map[i + y * width] * filter[fx]; - curColor2 += map[width - 1 - i + y * width] * filter[fx]; - } - fx++; - } - temp[x + y * width] = curColor; - temp[width - 1 - x + y * width] = curColor2; - } - - /* Do the main body */ - for (int x = halfWidth; x < width - halfWidth; x++) { - int fx = 0; - float4 curColor = float4(0.0f); - for (int i = x - halfWidth; i < x + halfWidth; i++) { - curColor += map[i + y * width] * filter[fx]; - fx++; + int xmin = math::max(x - halfWidth, 0); + int xmax = math::min(x + halfWidth, width); + for (int nx = xmin, index = (xmin - x) + halfWidth; nx < xmax; nx++, index++) { + curColor += map[nx + y * width] * filter[index]; } temp[x + y * width] = curColor; } } }); - /* Blur the columns */ + /* Blur the columns: read temp, write map */ threading::parallel_for(IndexRange(width), 32, [&](const IndexRange x_range) { + const float4 one = float4(1.0f); for (const int x : x_range) { - /* Do the top & bottom strips */ - for (int y = 0; y < halfWidth; y++) { + for (int y = 0; y < height; y++) { float4 curColor = float4(0.0f); - float4 curColor2 = float4(0.0f); - int fy = 0; - for (int i = y - halfWidth; i < y + halfWidth; i++) { - if ((i >= 0) && (i < height)) { - curColor += temp[x + i * width] * filter[fy]; - curColor2 += temp[x + (height - 1 - i) * width] * filter[fy]; - } - fy++; + int ymin = math::max(y - halfWidth, 0); + int ymax = math::min(y + halfWidth, height); + for (int ny = ymin, index = (ymin - y) + halfWidth; ny < ymax; ny++, index++) { + curColor += temp[x + ny * width] * filter[index]; } - - map[x + y * width] = curColor; - map[x + (height - 1 - y) * width] = curColor2; - } - - /* Do the main body */ - for (int y = halfWidth; y < height - halfWidth; y++) { - float4 curColor = float4(0.0f); - int fy = 0; - for (int i = y - halfWidth; i < y + halfWidth; i++) { - curColor += temp[x + i * width] * filter[fy]; - fy++; + if (src != nullptr) { + curColor = math::min(one, src[x + y * width] + curColor); } map[x + y * width] = curColor; } @@ -2062,23 +2035,6 @@ static void glow_blur_bitmap(blender::float4 *map, int width, int height, float }); } -static void blur_add_bitmap(const blender::float4 *src, - blender::float4 *dst, - int width, - int height) -{ - using namespace blender; - threading::parallel_for(IndexRange(height), 64, [&](const IndexRange y_range) { - const float4 v1 = float4(1.0f); - for (const int y : y_range) { - int index = y * width; - for (int x = 0; x < width; x++, index++) { - dst[index] = math::min(v1, src[index] + dst[index]); - } - } - }); -} - static void blur_isolate_highlights(const blender::float4 *in, blender::float4 *out, int width, @@ -2164,10 +2120,12 @@ static void do_glow_effect_byte(Sequence *seq, blur_isolate_highlights( inbuf.data(), outbuf.data(), x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); - glow_blur_bitmap(outbuf.data(), x, y, glow->dDist * (render_size / 100.0f), glow->dQuality); - if (!glow->bNoComp) { - blur_add_bitmap(inbuf.data(), outbuf.data(), x, y); - } + glow_blur_bitmap(glow->bNoComp ? nullptr : inbuf.data(), + outbuf.data(), + x, + y, + glow->dDist * (render_size / 100.0f), + glow->dQuality); threading::parallel_for(IndexRange(y), 64, [&](const IndexRange y_range) { size_t offset = y_range.first() * x; @@ -2202,14 +2160,12 @@ static void do_glow_effect_float(Sequence *seq, blur_isolate_highlights( inbuf, outbuf, x, y, glow->fMini * 3.0f, glow->fBoost * fac, glow->fClamp); - glow_blur_bitmap(reinterpret_cast(outbuf), + glow_blur_bitmap(glow->bNoComp ? nullptr : inbuf, + outbuf, x, y, glow->dDist * (render_size / 100.0f), glow->dQuality); - if (!glow->bNoComp) { - blur_add_bitmap(inbuf, outbuf, x, y); - } } static ImBuf *do_glow_effect(const SeqRenderData *context, -- 2.30.2 From 4e9e35edcb41ff6e69167f77446ef9097bcaa070 Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Wed, 6 Dec 2023 12:05:16 +0200 Subject: [PATCH 5/5] Cleanup timers --- source/blender/sequencer/intern/effects.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/source/blender/sequencer/intern/effects.cc b/source/blender/sequencer/intern/effects.cc index ba06101d9b3..b3c2a816b71 100644 --- a/source/blender/sequencer/intern/effects.cc +++ b/source/blender/sequencer/intern/effects.cc @@ -64,8 +64,6 @@ #include "strip_time.hh" #include "utils.hh" -#include "BLI_timeit.hh" - static SeqEffectHandle get_sequence_effect_impl(int seq_type); /* -------------------------------------------------------------------- */ @@ -2109,7 +2107,6 @@ static void do_glow_effect_byte(Sequence *seq, uchar *out) { using namespace blender; - SCOPED_TIMER(__func__); GlowVars *glow = (GlowVars *)seq->effectdata; Array inbuf(x * y); @@ -2153,7 +2150,6 @@ static void do_glow_effect_float(Sequence *seq, float *out) { using namespace blender; - SCOPED_TIMER(__func__); float4 *outbuf = reinterpret_cast(out); float4 *inbuf = reinterpret_cast(rect1); GlowVars *glow = (GlowVars *)seq->effectdata; -- 2.30.2