Realtime Compositor: Improve classic Kuwahara precision #114191

Closed
Omar Emara wants to merge 2 commits from OmarEmaraDev/blender:improve-sat-precision into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
10 changed files with 116 additions and 151 deletions

View File

@ -178,7 +178,6 @@ set(GLSL_SRC
shaders/compositor_smaa_edge_detection.glsl
shaders/compositor_smaa_neighborhood_blending.glsl
shaders/compositor_split_viewer.glsl
shaders/compositor_summed_area_table_compute_complete_blocks.glsl
shaders/compositor_summed_area_table_compute_complete_x_prologues.glsl
shaders/compositor_summed_area_table_compute_complete_y_prologues.glsl
shaders/compositor_summed_area_table_compute_incomplete_prologues.glsl

View File

@ -23,7 +23,9 @@ enum class SummedAreaTableOperation : uint8_t {
* a box filter. */
void summed_area_table(Context &context,
Result &input,
Result &output,
Result &blocks,
Result &x_prologues,
Result &y_prologues,
SummedAreaTableOperation operation = SummedAreaTableOperation::Identity);
} // namespace blender::realtime_compositor

View File

@ -54,6 +54,7 @@ static const char *get_compute_incomplete_prologues_shader(SummedAreaTableOperat
static void compute_incomplete_prologues(Context &context,
Result &input,
SummedAreaTableOperation operation,
Result &blocks,
Result &incomplete_x_prologues,
Result &incomplete_y_prologues)
{
@ -67,6 +68,9 @@ static void compute_incomplete_prologues(Context &context,
const int2 input_size = input.domain().size;
const int2 number_of_groups = math::divide_ceil(input_size, group_size);
blocks.allocate_texture(input.domain());
blocks.bind_as_image(shader, "blocks_img");
incomplete_x_prologues.allocate_texture(Domain(int2(input_size.y, number_of_groups.x)));
incomplete_x_prologues.bind_as_image(shader, "incomplete_x_prologues_img");
@ -151,57 +155,11 @@ static void compute_complete_y_prologues(Context &context,
complete_y_prologues.unbind_as_image();
}
/* Maps the given summed area table operation to the name of the complete blocks shader variant
 * that implements it. Every enumerator is expected to be handled; reaching the end asserts and
 * returns an empty name. */
static const char *get_compute_complete_blocks_shader(SummedAreaTableOperation operation)
{
  if (operation == SummedAreaTableOperation::Identity) {
    return "compositor_summed_area_table_compute_complete_blocks_identity";
  }

  if (operation == SummedAreaTableOperation::Square) {
    return "compositor_summed_area_table_compute_complete_blocks_square";
  }

  BLI_assert_unreachable();
  return "";
}
/* Computes the final summed area table blocks from the complete X and Y prologues using equation
* (41) to implement the fourth pass of Algorithm SAT. That equation simply uses an intermediate
* shared memory to cascade the accumulation of rows and then columns in each block using the
* prologues as initial values and writes each step of the latter accumulation to the output.
*
* The shader variant is selected by the given operation. The output texture is allocated here
* with the same domain as the input. */
static void compute_complete_blocks(Context &context,
Result &input,
Result &complete_x_prologues,
Result &complete_y_prologues,
SummedAreaTableOperation operation,
Result &output)
{
/* The shader is requested at full precision. */
GPUShader *shader = context.get_shader(get_compute_complete_blocks_shader(operation),
ResultPrecision::Full);
GPU_shader_bind(shader);
input.bind_as_texture(shader, "input_tx");
complete_x_prologues.bind_as_texture(shader, "complete_x_prologues_tx");
complete_y_prologues.bind_as_texture(shader, "complete_y_prologues_tx");
output.allocate_texture(input.domain());
/* NOTE(review): the trailing true presumably requests read access in addition to write access
* for the image binding. Confirm against the declaration of Result::bind_as_image. */
output.bind_as_image(shader, "output_img", true);
/* Dispatch one work group per 16x16 block of the input, rounding up so that blocks that are only
* partially covered by the input are also processed. */
const int2 group_size = int2(16);
const int2 input_size = input.domain().size;
const int2 number_of_groups = math::divide_ceil(input_size, group_size);
GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);
GPU_shader_unbind();
input.unbind_as_texture();
complete_x_prologues.unbind_as_texture();
complete_y_prologues.unbind_as_texture();
output.unbind_as_image();
}
void summed_area_table(Context &context,
Result &input,
Result &output,
Result &blocks,
Result &x_prologues,
Result &y_prologues,
SummedAreaTableOperation operation)
{
Result incomplete_x_prologues = context.create_temporary_result(ResultType::Color,
@ -209,27 +167,18 @@ void summed_area_table(Context &context,
Result incomplete_y_prologues = context.create_temporary_result(ResultType::Color,
ResultPrecision::Full);
compute_incomplete_prologues(
context, input, operation, incomplete_x_prologues, incomplete_y_prologues);
context, input, operation, blocks, incomplete_x_prologues, incomplete_y_prologues);
Result complete_x_prologues = context.create_temporary_result(ResultType::Color,
ResultPrecision::Full);
Result complete_x_prologues_sum = context.create_temporary_result(ResultType::Color,
ResultPrecision::Full);
compute_complete_x_prologues(
context, input, incomplete_x_prologues, complete_x_prologues, complete_x_prologues_sum);
context, input, incomplete_x_prologues, x_prologues, complete_x_prologues_sum);
incomplete_x_prologues.release();
Result complete_y_prologues = context.create_temporary_result(ResultType::Color,
ResultPrecision::Full);
compute_complete_y_prologues(
context, input, incomplete_y_prologues, complete_x_prologues_sum, complete_y_prologues);
context, input, incomplete_y_prologues, complete_x_prologues_sum, y_prologues);
incomplete_y_prologues.release();
complete_x_prologues_sum.release();
compute_complete_blocks(
context, input, complete_x_prologues, complete_y_prologues, operation, output);
complete_x_prologues.release();
complete_y_prologues.release();
}
} // namespace blender::realtime_compositor

View File

@ -35,9 +35,13 @@ void main()
int quadrant_pixel_count = region_size.x * region_size.y;
#if defined(SUMMED_AREA_TABLE)
mean_of_color_of_quadrants[q] = summed_area_table_sum(table_tx, lower_bound, upper_bound);
mean_of_squared_color_of_quadrants[q] = summed_area_table_sum(
squared_table_tx, lower_bound, upper_bound);
mean_of_color_of_quadrants[q] = summed_area_table_sum(
blocks_tx, x_prologues_tx, y_prologues_tx, lower_bound, upper_bound);
mean_of_squared_color_of_quadrants[q] = summed_area_table_sum(squared_blocks_tx,
squared_x_prologues_tx,
squared_y_prologues_tx,
lower_bound,
upper_bound);
#else
for (int j = 0; j <= radius; j++) {
for (int i = 0; i <= radius; i++) {

View File

@ -1,38 +0,0 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* An intermediate shared memory where the result of X accumulation will be stored. */
shared vec4 block[gl_WorkGroupSize.x][gl_WorkGroupSize.y];
/* Computes one block of the output per work group in two steps separated by a barrier. First,
* the invocations whose local X is zero each accumulate one row of the block horizontally into
* the shared block array. Then, the invocations whose local Y is zero each accumulate one column
* of that array vertically and store every step to the output image. */
void main()
{
/* Accumulate the block along the horizontal direction starting from the X prologue value,
* writing each accumulation step to the intermediate shared memory. */
if (gl_LocalInvocationID.x == 0) {
/* The X prologues are addressed by the global row along X and the work group column along Y.
* The vec4(0.0) argument is presumably the fallback for out of bound loads. */
ivec2 x_prologue_texel = ivec2(gl_GlobalInvocationID.y, gl_WorkGroupID.x);
vec4 x_accumulated_color = texture_load(complete_x_prologues_tx, x_prologue_texel, vec4(0.0));
for (int i = 0; i < gl_WorkGroupSize.x; i++) {
ivec2 texel = ivec2(gl_WorkGroupID.x * gl_WorkGroupSize.x + i, gl_GlobalInvocationID.y);
x_accumulated_color += OPERATION(texture_load(input_tx, texel, vec4(0.0)));
block[i][gl_LocalInvocationID.y] = x_accumulated_color;
}
}
/* Make sure the result of X accumulation is completely done. */
barrier();
/* Accumulate the block along the vertical direction starting from the Y prologue value,
* writing each accumulation step to the output image. */
if (gl_LocalInvocationID.y == 0) {
/* The Y prologues are addressed by the global column along X and the work group row along Y. */
ivec2 y_prologue_texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y);
vec4 y_accumulated_color = texture_load(complete_y_prologues_tx, y_prologue_texel, vec4(0.0));
for (int i = 0; i < gl_WorkGroupSize.y; i++) {
y_accumulated_color += block[gl_LocalInvocationID.x][i];
ivec2 texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y * gl_WorkGroupSize.y + i);
imageStore(output_img, texel, y_accumulated_color);
}
}
}

View File

@ -41,6 +41,8 @@ void main()
vec4 y_accumulated_color = vec4(0.0);
for (int i = 0; i < gl_WorkGroupSize.y; i++) {
y_accumulated_color += block[gl_LocalInvocationID.x][i];
ivec2 texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y * gl_WorkGroupSize.y + i);
imageStore(blocks_img, texel, y_accumulated_color);
}
/* Note that the first row of prologues is the result of accumulating a virtual block that is

View File

@ -28,8 +28,12 @@ GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_convolution_variable_size)
GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_shared)
.additional_info("compositor_kuwahara_classic_shared")
.define("SUMMED_AREA_TABLE")
.sampler(0, ImageType::FLOAT_2D, "table_tx")
.sampler(1, ImageType::FLOAT_2D, "squared_table_tx");
.sampler(0, ImageType::FLOAT_2D, "blocks_tx")
.sampler(1, ImageType::FLOAT_2D, "x_prologues_tx")
.sampler(2, ImageType::FLOAT_2D, "y_prologues_tx")
.sampler(3, ImageType::FLOAT_2D, "squared_blocks_tx")
.sampler(4, ImageType::FLOAT_2D, "squared_x_prologues_tx")
.sampler(5, ImageType::FLOAT_2D, "squared_y_prologues_tx");
GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_constant_size)
.additional_info("compositor_kuwahara_classic_summed_area_table_shared")
@ -39,7 +43,7 @@ GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_constant_si
GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_variable_size)
.additional_info("compositor_kuwahara_classic_summed_area_table_shared")
.sampler(2, ImageType::FLOAT_2D, "size_tx")
.sampler(6, ImageType::FLOAT_2D, "size_tx")
.define("VARIABLE_SIZE")
.do_static_compilation(true);

View File

@ -7,8 +7,9 @@
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_shared)
.local_group_size(16, 16)
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_x_prologues_img")
.image(1, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_y_prologues_img")
.image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "blocks_img")
.image(1, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_x_prologues_img")
.image(2, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_y_prologues_img")
.compute_source("compositor_summed_area_table_compute_incomplete_prologues.glsl");
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_identity)
@ -36,21 +37,3 @@ GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_y_prologues
.image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "complete_y_prologues_img")
.compute_source("compositor_summed_area_table_compute_complete_y_prologues.glsl")
.do_static_compilation(true);
/* Shared interface of the complete blocks compute pass. It declares a 16x16 local group, samplers
* for the input and the complete X and Y prologues, and an RGBA32F read-write image for the
* output. It does not request static compilation itself and is pulled in by other infos through
* additional_info. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_shared)
.local_group_size(16, 16)
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "complete_x_prologues_tx")
.sampler(2, ImageType::FLOAT_2D, "complete_y_prologues_tx")
.image(0, GPU_RGBA32F, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_summed_area_table_compute_complete_blocks.glsl");
/* Variant of the complete blocks pass that defines OPERATION(value) as the value itself and
* requests static compilation. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_identity)
.additional_info("compositor_summed_area_table_compute_complete_blocks_shared")
.define("OPERATION(value)", "value")
.do_static_compilation(true);
/* Variant of the complete blocks pass that defines OPERATION(value) as the value multiplied by
* itself and requests static compilation.
* NOTE(review): the replacement text is not parenthesized. Assuming the define is emitted as a
* textual macro, an argument containing a lower precedence operator would expand incorrectly, so
* callers should only pass a single primary expression. Confirm. */
GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_square)
.additional_info("compositor_summed_area_table_compute_complete_blocks_shared")
.define("OPERATION(value)", "value * value")
.do_static_compilation(true);

View File

@ -2,6 +2,27 @@
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
#define SAT_BLOCK_SIZE 16
/* Loads the two parts of the summed area table value at the given texel. The returned value is
* the texel loaded from the blocks texture. The inter_value output receives the sum of the X
* prologues loaded for every row from the first row of the texel's block up to and including the
* texel's row, plus the Y prologue loaded for the texel's column and block row. All loads pass
* vec4(0.0), presumably as the fallback for out of bound texels.
*
* NOTE(review): presumably the two parts are returned separately so that the caller can combine
* them independently for better precision. Confirm with the author.
* NOTE(review): the block indices are computed using integer division by SAT_BLOCK_SIZE. Verify
* that this gives the intended prologue for texels with negative coordinates. */
vec4 summed_area_table_value(sampler2D blocks,
sampler2D x_prologues,
sampler2D y_prologues,
ivec2 texel,
out vec4 inter_value)
{
inter_value = vec4(0.0);
/* The first row of the block that contains the texel. */
int start_y = (texel.y / SAT_BLOCK_SIZE) * SAT_BLOCK_SIZE;
/* The X prologues are addressed by the row along X and the block column along Y. */
for (int i = start_y; i <= texel.y; i++) {
inter_value += texture_load(x_prologues, ivec2(i, texel.x / SAT_BLOCK_SIZE), vec4(0.0));
}
/* The Y prologues are addressed by the column along X and the block row along Y. */
inter_value += texture_load(y_prologues, texel / ivec2(1, SAT_BLOCK_SIZE), vec4(0.0));
return texture_load(blocks, texel, vec4(0.0));
}
/* Computes the sum of the rectangular region defined by the given lower and upper bounds from the
* given summed area table. It is assumed that the given upper bound is larger than the given lower
* bound, otherwise, undefined behavior is invoked. Looking at the diagram below, in order to
@ -35,17 +56,31 @@
* The aforementioned equation eliminates the edges between regions X, C, and A since they get
* subtracted with C and A. To avoid this, we subtract 1 from the lower bound and fallback to zero
* for out of bound sampling. */
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
vec4 summed_area_table_sum(sampler2D table, ivec2 lower_bound, ivec2 upper_bound)
vec4 summed_area_table_sum(sampler2D blocks,
sampler2D x_prologues,
sampler2D y_prologues,
ivec2 lower_bound,
ivec2 upper_bound)
{
ivec2 corrected_lower_bound = lower_bound - ivec2(1);
ivec2 corrected_upper_bound = min(texture_size(table) - ivec2(1), upper_bound);
vec4 addend = texture_load(table, corrected_upper_bound, vec4(0.0)) +
texture_load(table, corrected_lower_bound, vec4(0.0));
vec4 subtrahend =
texture_load(table, ivec2(corrected_lower_bound.x, corrected_upper_bound.y), vec4(0.0)) +
texture_load(table, ivec2(corrected_upper_bound.x, corrected_lower_bound.y), vec4(0.0));
return addend - subtrahend;
ivec2 lower_left_texel = lower_bound - ivec2(1);
ivec2 upper_right_texel = min(texture_size(blocks) - ivec2(1), upper_bound);
ivec2 upper_left_texel = ivec2(lower_left_texel.x, upper_right_texel.y);
ivec2 lower_right_texel = ivec2(upper_right_texel.x, lower_left_texel.y);
vec4 inter_upper_right, inter_lower_left, inter_upper_left, inter_lower_right;
vec4 intra_upper_right = summed_area_table_value(
blocks, x_prologues, y_prologues, upper_right_texel, inter_upper_right);
vec4 intra_lower_left = summed_area_table_value(
blocks, x_prologues, y_prologues, lower_left_texel, inter_lower_left);
vec4 intra_upper_left = summed_area_table_value(
blocks, x_prologues, y_prologues, upper_left_texel, inter_upper_left);
vec4 intra_lower_right = summed_area_table_value(
blocks, x_prologues, y_prologues, lower_right_texel, inter_lower_right);
vec4 intra_value = (intra_upper_right + intra_lower_left) -
(intra_upper_left + intra_lower_right);
vec4 inter_value = (inter_upper_right + inter_lower_left) -
(inter_upper_left + inter_lower_right);
return intra_value + inter_value;
}

View File

@ -122,13 +122,25 @@ class ConvertKuwaharaOperation : public NodeOperation {
void execute_classic_summed_area_table()
{
Result table = context().create_temporary_result(ResultType::Color, ResultPrecision::Full);
summed_area_table(context(), get_input("Image"), table);
Result blocks = context().create_temporary_result(ResultType::Color, ResultPrecision::Full);
Result x_prologues = context().create_temporary_result(ResultType::Color,
ResultPrecision::Full);
Result y_prologues = context().create_temporary_result(ResultType::Color,
ResultPrecision::Full);
summed_area_table(context(), get_input("Image"), blocks, x_prologues, y_prologues);
Result squared_table = context().create_temporary_result(ResultType::Color,
ResultPrecision::Full);
summed_area_table(
context(), get_input("Image"), squared_table, SummedAreaTableOperation::Square);
Result squared_blocks = context().create_temporary_result(ResultType::Color,
ResultPrecision::Full);
Result squared_x_prologues = context().create_temporary_result(ResultType::Color,
ResultPrecision::Full);
Result squared_y_prologues = context().create_temporary_result(ResultType::Color,
ResultPrecision::Full);
summed_area_table(context(),
get_input("Image"),
squared_blocks,
squared_x_prologues,
squared_y_prologues,
SummedAreaTableOperation::Square);
GPUShader *shader = context().get_shader(get_classic_summed_area_table_shader_name());
GPU_shader_bind(shader);
@ -141,8 +153,13 @@ class ConvertKuwaharaOperation : public NodeOperation {
size_input.bind_as_texture(shader, "size_tx");
}
table.bind_as_texture(shader, "table_tx");
squared_table.bind_as_texture(shader, "squared_table_tx");
blocks.bind_as_texture(shader, "blocks_tx");
x_prologues.bind_as_texture(shader, "x_prologues_tx");
y_prologues.bind_as_texture(shader, "y_prologues_tx");
squared_blocks.bind_as_texture(shader, "squared_blocks_tx");
squared_x_prologues.bind_as_texture(shader, "squared_x_prologues_tx");
squared_y_prologues.bind_as_texture(shader, "squared_y_prologues_tx");
const Domain domain = compute_domain();
Result &output_image = get_result("Image");
@ -151,13 +168,21 @@ class ConvertKuwaharaOperation : public NodeOperation {
compute_dispatch_threads_at_least(shader, domain.size);
table.unbind_as_texture();
squared_table.unbind_as_texture();
blocks.unbind_as_texture();
x_prologues.unbind_as_texture();
y_prologues.unbind_as_texture();
squared_blocks.unbind_as_texture();
squared_x_prologues.unbind_as_texture();
squared_y_prologues.unbind_as_texture();
output_image.unbind_as_image();
GPU_shader_unbind();
table.release();
squared_table.release();
blocks.release();
x_prologues.release();
y_prologues.release();
squared_blocks.release();
squared_x_prologues.release();
squared_y_prologues.release();
}
/* An implementation of the Anisotropic Kuwahara filter described in the paper: