From 724f0c2c9e1d11fbd2a0c6d3121dfea3bf71147f Mon Sep 17 00:00:00 2001 From: Aras Pranckevicius Date: Tue, 16 Jan 2024 22:02:57 +0200 Subject: [PATCH] ImBuf: speed up Subsampled3x3 image filter Conceptually, the Subsampling filter is a box filter: it sums up N source image pixels, computes their average and outputs the result. The critical thing is that this should be done in premultiplied space, so that colors from fully or mostly transparent regions do not "override" opaque colors. Previously, especially when operating on byte images, the code achieved this by always working on byte values, doing "progressively smaller" lerps into the byte color result, taking care of premultiplication and again storing the "straight" alpha for each sample being processed. This meant that for each sample, there were 3 divisions involved! This also led to some precision loss, since for all 9 samples all the intermediate results would only be stored at byte precision. Reformulate that by simply accumulating the premultiplied color as a float color. This gets rid of all divisions, except in the last step, when said float needs to be written back into a byte color. Processing a destination 4K UHD resolution image with the Subsampling 3x3 filter: - Windows/VS2022/Ryzen5950X: 52.7ms -> 28.3ms - Mac/clang15/M1Max: 54.4ms -> 43.7ms The unit test results differ slightly, since the output is now more accurate (as noted above, it previously suffered from some precision loss). 
--- source/blender/imbuf/intern/transform.cc | 105 ++++++++---------- source/blender/imbuf/intern/transform_test.cc | 6 +- 2 files changed, 50 insertions(+), 61 deletions(-) diff --git a/source/blender/imbuf/intern/transform.cc b/source/blender/imbuf/intern/transform.cc index f531f527df4..3629e23e911 100644 --- a/source/blender/imbuf/intern/transform.cc +++ b/source/blender/imbuf/intern/transform.cc @@ -134,26 +134,6 @@ static float wrap_uv(float value, int size) return x; } -template -static void add_subsample(const T *src, T *dst, int sample_number) -{ - BLI_STATIC_ASSERT((is_same_any_v), "Only uchar and float channels supported."); - - float factor = 1.0 / (sample_number + 1); - if constexpr (std::is_same_v) { - BLI_STATIC_ASSERT(NumChannels == 4, "Pixels using uchar requires to have 4 channels."); - blend_color_interpolate_byte(dst, dst, src, factor); - } - else if constexpr (std::is_same_v && NumChannels == 4) { - blend_color_interpolate_float(dst, dst, src, factor); - } - else if constexpr (std::is_same_v) { - for (int i : IndexRange(NumChannels)) { - dst[i] = dst[i] * (1.0f - factor) + src[i] * factor; - } - } -} - template static void sample_nearest_float(const ImBuf *source, float u, float v, float *r_sample) { @@ -235,39 +215,48 @@ static void sample_image(const ImBuf *source, float u, float v, T *r_sample) } } -template static void store_sample(const T *sample, T *dst) +static void add_subsample(const float src[4], float dst[4]) { - if constexpr (std::is_same_v) { - BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels."); - copy_v4_v4_uchar(dst, sample); - } - else if constexpr (std::is_same_v && SrcChannels == 4) { - copy_v4_v4(dst, sample); - } - else if constexpr (std::is_same_v && SrcChannels == 3) { - copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f); - } - else if constexpr (std::is_same_v && SrcChannels == 2) { - copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f); - } - else if constexpr (std::is_same_v && SrcChannels 
== 1) { - /* Note: single channel sample is stored as grayscale. */ - copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f); - } - else { - BLI_assert_unreachable(); - } + add_v4_v4(dst, src); } -template -static void mix_and_store_sample(const T *sample, T *dst, const float mix_factor) +static void add_subsample(const uchar src[4], float dst[4]) { - if constexpr (std::is_same_v) { - BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels."); - blend_color_interpolate_byte(dst, dst, sample, mix_factor); + float premul[4]; + straight_uchar_to_premul_float(premul, src); + add_v4_v4(dst, premul); +} + +static void store_premul_float_sample(const float sample[4], float dst[4]) +{ + copy_v4_v4(dst, sample); +} + +static void store_premul_float_sample(const float sample[4], uchar dst[4]) +{ + premul_float_to_straight_uchar(dst, sample); +} + +template static void store_sample(const uchar *sample, uchar *dst) +{ + BLI_STATIC_ASSERT(SrcChannels == 4, "Unsigned chars always have 4 channels."); + copy_v4_v4_uchar(dst, sample); +} + +template static void store_sample(const float *sample, float *dst) +{ + if constexpr (SrcChannels == 4) { + copy_v4_v4(dst, sample); } - else if constexpr (std::is_same_v && SrcChannels == 4) { - blend_color_interpolate_float(dst, dst, sample, mix_factor); + else if constexpr (SrcChannels == 3) { + copy_v4_fl4(dst, sample[0], sample[1], sample[2], 1.0f); + } + else if constexpr (SrcChannels == 2) { + copy_v4_fl4(dst, sample[0], sample[1], 0.0f, 1.0f); + } + else if constexpr (SrcChannels == 1) { + /* Note: single channel sample is stored as grayscale. */ + copy_v4_fl4(dst, sample[0], sample[0], sample[0], 1.0f); } else { BLI_assert_unreachable(); @@ -286,29 +275,29 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range) float2 uv_start = ctx.start_uv + ctx.add_x * 0.5f + ctx.add_y * 0.5f; if (ctx.subsampling_deltas.size() > 1) { - /* Multiple samples per pixel. 
*/ + /* Multiple samples per pixel: accumulate them premultiplied, + * divide by sample count and write out (un-premultiplying if writing out + * to byte image). */ + const float inv_count = 1.0f / ctx.subsampling_deltas.size(); for (int yi : y_range) { T *output = init_pixel_pointer(ctx.dst, ctx.dst_region_x_range.first(), yi); float2 uv_row = uv_start + yi * ctx.add_y; for (int xi : ctx.dst_region_x_range) { float2 uv = uv_row + xi * ctx.add_x; - T sample[4] = {}; - int num_subsamples_added = 0; + float sample[4] = {}; for (const float2 &delta_uv : ctx.subsampling_deltas) { const float2 sub_uv = uv + delta_uv; if (!CropSource || !should_discard(ctx, sub_uv)) { T sub_sample[4]; sample_image(ctx.src, sub_uv.x, sub_uv.y, sub_sample); - add_subsample(sub_sample, sample, num_subsamples_added); - num_subsamples_added += 1; + add_subsample(sub_sample, sample); } } - if (num_subsamples_added != 0) { - const float mix_weight = float(num_subsamples_added) / ctx.subsampling_deltas.size(); - mix_and_store_sample(sample, output, mix_weight); - } + mul_v4_v4fl(sample, sample, inv_count); + store_premul_float_sample(sample, output); + output += 4; } } @@ -323,7 +312,7 @@ static void process_scanlines(const TransformContext &ctx, IndexRange y_range) if (!CropSource || !should_discard(ctx, uv)) { T sample[4]; sample_image(ctx.src, uv.x, uv.y, sample); - store_sample(sample, output); + store_sample(sample, output); } output += 4; } diff --git a/source/blender/imbuf/intern/transform_test.cc b/source/blender/imbuf/intern/transform_test.cc index f513cff1285..816926b5d88 100644 --- a/source/blender/imbuf/intern/transform_test.cc +++ b/source/blender/imbuf/intern/transform_test.cc @@ -71,9 +71,9 @@ TEST(imbuf_transform, nearest_subsample3_2x_smaller) { ImBuf *res = transform_2x_smaller(IMB_FILTER_NEAREST, 3); const ColorTheme4b *got = reinterpret_cast(res->byte_buffer.data); - EXPECT_EQ(got[0], ColorTheme4b(226, 168, 113, 255)); - EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 16)); - 
EXPECT_EQ(got[2], ColorTheme4b(55, 22, 64, 254)); + EXPECT_EQ(got[0], ColorTheme4b(227, 170, 113, 255)); + EXPECT_EQ(got[1], ColorTheme4b(133, 55, 31, 17)); + EXPECT_EQ(got[2], ColorTheme4b(56, 22, 64, 253)); IMB_freeImBuf(res); } -- 2.30.2