10 changed files with 116 additions and 151 deletions
--- a/source/blender/compositor/realtime_compositor/CMakeLists.txt
+++ b/source/blender/compositor/realtime_compositor/CMakeLists.txt
@ -178,7 +178,6 @@ set(GLSL_SRC
  shaders/compositor_smaa_edge_detection.glsl
  shaders/compositor_smaa_neighborhood_blending.glsl
  shaders/compositor_split_viewer.glsl
-  shaders/compositor_summed_area_table_compute_complete_blocks.glsl
  shaders/compositor_summed_area_table_compute_complete_x_prologues.glsl
  shaders/compositor_summed_area_table_compute_complete_y_prologues.glsl
  shaders/compositor_summed_area_table_compute_incomplete_prologues.glsl
--- a/source/blender/compositor/realtime_compositor/algorithms/COM_algorithm_summed_area_table.hh
+++ b/source/blender/compositor/realtime_compositor/algorithms/COM_algorithm_summed_area_table.hh
@ -23,7 +23,9 @@ enum class SummedAreaTableOperation : uint8_t {
 * a box filter. */
 void summed_area_table(Context &context,
                       Result &input,
-                       Result &output,
+                       Result &blocks,
+                       Result &x_prologues,
+                       Result &y_prologues,
                       SummedAreaTableOperation operation = SummedAreaTableOperation::Identity);

 }  // namespace blender::realtime_compositor
--- a/source/blender/compositor/realtime_compositor/algorithms/intern/summed_area_table.cc
+++ b/source/blender/compositor/realtime_compositor/algorithms/intern/summed_area_table.cc
@ -54,6 +54,7 @@ static const char *get_compute_incomplete_prologues_shader(SummedAreaTableOperat
 static void compute_incomplete_prologues(Context &context,
                                         Result &input,
                                         SummedAreaTableOperation operation,
+                                         Result &blocks,
                                         Result &incomplete_x_prologues,
                                         Result &incomplete_y_prologues)
 {
@ -67,6 +68,9 @@ static void compute_incomplete_prologues(Context &context,
  const int2 input_size = input.domain().size;
  const int2 number_of_groups = math::divide_ceil(input_size, group_size);

+  blocks.allocate_texture(input.domain());
+  blocks.bind_as_image(shader, "blocks_img");
+
  incomplete_x_prologues.allocate_texture(Domain(int2(input_size.y, number_of_groups.x)));
  incomplete_x_prologues.bind_as_image(shader, "incomplete_x_prologues_img");

@ -151,57 +155,11 @@ static void compute_complete_y_prologues(Context &context,
  complete_y_prologues.unbind_as_image();
 }

-static const char *get_compute_complete_blocks_shader(SummedAreaTableOperation operation)
-{
-  switch (operation) {
-    case SummedAreaTableOperation::Identity:
-      return "compositor_summed_area_table_compute_complete_blocks_identity";
-    case SummedAreaTableOperation::Square:
-      return "compositor_summed_area_table_compute_complete_blocks_square";
-  }
-
-  BLI_assert_unreachable();
-  return "";
-}
-
-/* Computes the final summed area table blocks from the complete X and Y prologues using equation
- * (41) to implement the fourth pass of Algorithm SAT. That equation simply uses an intermediate
- * shared memory to cascade the accumulation of rows and then column in each block using the
- * prologues as initial values and writes each step of the latter accumulation to the output. */
-static void compute_complete_blocks(Context &context,
-                                    Result &input,
-                                    Result &complete_x_prologues,
-                                    Result &complete_y_prologues,
-                                    SummedAreaTableOperation operation,
-                                    Result &output)
-{
-  GPUShader *shader = context.get_shader(get_compute_complete_blocks_shader(operation),
-                                         ResultPrecision::Full);
-  GPU_shader_bind(shader);
-
-  input.bind_as_texture(shader, "input_tx");
-  complete_x_prologues.bind_as_texture(shader, "complete_x_prologues_tx");
-  complete_y_prologues.bind_as_texture(shader, "complete_y_prologues_tx");
-
-  output.allocate_texture(input.domain());
-  output.bind_as_image(shader, "output_img", true);
-
-  const int2 group_size = int2(16);
-  const int2 input_size = input.domain().size;
-  const int2 number_of_groups = math::divide_ceil(input_size, group_size);
-
-  GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);
-
-  GPU_shader_unbind();
-  input.unbind_as_texture();
-  complete_x_prologues.unbind_as_texture();
-  complete_y_prologues.unbind_as_texture();
-  output.unbind_as_image();
-}
-
 void summed_area_table(Context &context,
                       Result &input,
-                       Result &output,
+                       Result &blocks,
+                       Result &x_prologues,
+                       Result &y_prologues,
                       SummedAreaTableOperation operation)
 {
  Result incomplete_x_prologues = context.create_temporary_result(ResultType::Color,
@ -209,27 +167,18 @@ void summed_area_table(Context &context,
  Result incomplete_y_prologues = context.create_temporary_result(ResultType::Color,
                                                                  ResultPrecision::Full);
  compute_incomplete_prologues(
-      context, input, operation, incomplete_x_prologues, incomplete_y_prologues);
+      context, input, operation, blocks, incomplete_x_prologues, incomplete_y_prologues);

-  Result complete_x_prologues = context.create_temporary_result(ResultType::Color,
-                                                                ResultPrecision::Full);
  Result complete_x_prologues_sum = context.create_temporary_result(ResultType::Color,
                                                                    ResultPrecision::Full);
  compute_complete_x_prologues(
-      context, input, incomplete_x_prologues, complete_x_prologues, complete_x_prologues_sum);
+      context, input, incomplete_x_prologues, x_prologues, complete_x_prologues_sum);
  incomplete_x_prologues.release();

-  Result complete_y_prologues = context.create_temporary_result(ResultType::Color,
-                                                                ResultPrecision::Full);
  compute_complete_y_prologues(
-      context, input, incomplete_y_prologues, complete_x_prologues_sum, complete_y_prologues);
+      context, input, incomplete_y_prologues, complete_x_prologues_sum, y_prologues);
  incomplete_y_prologues.release();
  complete_x_prologues_sum.release();
-
-  compute_complete_blocks(
-      context, input, complete_x_prologues, complete_y_prologues, operation, output);
-  complete_x_prologues.release();
-  complete_y_prologues.release();
 }

 }  // namespace blender::realtime_compositor
--- a/source/blender/compositor/realtime_compositor/shaders/compositor_kuwahara_classic.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/compositor_kuwahara_classic.glsl
@ -35,9 +35,13 @@ void main()
    int quadrant_pixel_count = region_size.x * region_size.y;

 #if defined(SUMMED_AREA_TABLE)
-    mean_of_color_of_quadrants[q] = summed_area_table_sum(table_tx, lower_bound, upper_bound);
-    mean_of_squared_color_of_quadrants[q] = summed_area_table_sum(
-        squared_table_tx, lower_bound, upper_bound);
+    mean_of_color_of_quadrants[q] = summed_area_table_sum(
+        blocks_tx, x_prologues_tx, y_prologues_tx, lower_bound, upper_bound);
+    mean_of_squared_color_of_quadrants[q] = summed_area_table_sum(squared_blocks_tx,
+                                                                  squared_x_prologues_tx,
+                                                                  squared_y_prologues_tx,
+                                                                  lower_bound,
+                                                                  upper_bound);
 #else
    for (int j = 0; j <= radius; j++) {
      for (int i = 0; i <= radius; i++) {
--- a/source/blender/compositor/realtime_compositor/shaders/compositor_summed_area_table_compute_complete_blocks.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/compositor_summed_area_table_compute_complete_blocks.glsl
@ -1,38 +0,0 @@
-/* SPDX-FileCopyrightText: 2023 Blender Authors
- *
- * SPDX-License-Identifier: GPL-2.0-or-later */
-
-#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
-
-/* An intermediate shared memory where the result of X accumulation will be stored. */
-shared vec4 block[gl_WorkGroupSize.x][gl_WorkGroupSize.y];
-
-void main()
-{
-  /* Accumulate the block along the horizontal direction starting from the X prologue value,
-   * writing each accumulation step to the intermediate shared memory. */
-  if (gl_LocalInvocationID.x == 0) {
-    ivec2 x_prologue_texel = ivec2(gl_GlobalInvocationID.y, gl_WorkGroupID.x);
-    vec4 x_accumulated_color = texture_load(complete_x_prologues_tx, x_prologue_texel, vec4(0.0));
-    for (int i = 0; i < gl_WorkGroupSize.x; i++) {
-      ivec2 texel = ivec2(gl_WorkGroupID.x * gl_WorkGroupSize.x + i, gl_GlobalInvocationID.y);
-      x_accumulated_color += OPERATION(texture_load(input_tx, texel, vec4(0.0)));
-      block[i][gl_LocalInvocationID.y] = x_accumulated_color;
-    }
-  }
-
-  /* Make sure the result of X accumulation is completely done. */
-  barrier();
-
-  /* Accumulate the block along the vertical direction starting from the Y prologue value,
-   * writing each accumulation step to the output image. */
-  if (gl_LocalInvocationID.y == 0) {
-    ivec2 y_prologue_texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y);
-    vec4 y_accumulated_color = texture_load(complete_y_prologues_tx, y_prologue_texel, vec4(0.0));
-    for (int i = 0; i < gl_WorkGroupSize.y; i++) {
-      y_accumulated_color += block[gl_LocalInvocationID.x][i];
-      ivec2 texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y * gl_WorkGroupSize.y + i);
-      imageStore(output_img, texel, y_accumulated_color);
-    }
-  }
-}
--- a/source/blender/compositor/realtime_compositor/shaders/compositor_summed_area_table_compute_incomplete_prologues.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/compositor_summed_area_table_compute_incomplete_prologues.glsl
@ -41,6 +41,8 @@ void main()
    vec4 y_accumulated_color = vec4(0.0);
    for (int i = 0; i < gl_WorkGroupSize.y; i++) {
      y_accumulated_color += block[gl_LocalInvocationID.x][i];
+      ivec2 texel = ivec2(gl_GlobalInvocationID.x, gl_WorkGroupID.y * gl_WorkGroupSize.y + i);
+      imageStore(blocks_img, texel, y_accumulated_color);
    }

    /* Note that the first row of prologues is the result of accumulating a virtual block that is
--- a/source/blender/compositor/realtime_compositor/shaders/infos/compositor_kuwahara_info.hh
+++ b/source/blender/compositor/realtime_compositor/shaders/infos/compositor_kuwahara_info.hh
@ -28,8 +28,12 @@ GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_convolution_variable_size)
 GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_shared)
    .additional_info("compositor_kuwahara_classic_shared")
    .define("SUMMED_AREA_TABLE")
-    .sampler(0, ImageType::FLOAT_2D, "table_tx")
-    .sampler(1, ImageType::FLOAT_2D, "squared_table_tx");
+    .sampler(0, ImageType::FLOAT_2D, "blocks_tx")
+    .sampler(1, ImageType::FLOAT_2D, "x_prologues_tx")
+    .sampler(2, ImageType::FLOAT_2D, "y_prologues_tx")
+    .sampler(3, ImageType::FLOAT_2D, "squared_blocks_tx")
+    .sampler(4, ImageType::FLOAT_2D, "squared_x_prologues_tx")
+    .sampler(5, ImageType::FLOAT_2D, "squared_y_prologues_tx");

 GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_constant_size)
    .additional_info("compositor_kuwahara_classic_summed_area_table_shared")
@ -39,7 +43,7 @@ GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_constant_si

 GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table_variable_size)
    .additional_info("compositor_kuwahara_classic_summed_area_table_shared")
-    .sampler(2, ImageType::FLOAT_2D, "size_tx")
+    .sampler(6, ImageType::FLOAT_2D, "size_tx")
    .define("VARIABLE_SIZE")
    .do_static_compilation(true);

--- a/source/blender/compositor/realtime_compositor/shaders/infos/compositor_summed_area_table_info.hh
+++ b/source/blender/compositor/realtime_compositor/shaders/infos/compositor_summed_area_table_info.hh
@ -7,8 +7,9 @@
 GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_shared)
    .local_group_size(16, 16)
    .sampler(0, ImageType::FLOAT_2D, "input_tx")
-    .image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_x_prologues_img")
-    .image(1, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_y_prologues_img")
+    .image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "blocks_img")
+    .image(1, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_x_prologues_img")
+    .image(2, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "incomplete_y_prologues_img")
    .compute_source("compositor_summed_area_table_compute_incomplete_prologues.glsl");

 GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_incomplete_prologues_identity)
@ -36,21 +37,3 @@ GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_y_prologues
    .image(0, GPU_RGBA32F, Qualifier::WRITE, ImageType::FLOAT_2D, "complete_y_prologues_img")
    .compute_source("compositor_summed_area_table_compute_complete_y_prologues.glsl")
    .do_static_compilation(true);
-
-GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_shared)
-    .local_group_size(16, 16)
-    .sampler(0, ImageType::FLOAT_2D, "input_tx")
-    .sampler(1, ImageType::FLOAT_2D, "complete_x_prologues_tx")
-    .sampler(2, ImageType::FLOAT_2D, "complete_y_prologues_tx")
-    .image(0, GPU_RGBA32F, Qualifier::READ_WRITE, ImageType::FLOAT_2D, "output_img")
-    .compute_source("compositor_summed_area_table_compute_complete_blocks.glsl");
-
-GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_identity)
-    .additional_info("compositor_summed_area_table_compute_complete_blocks_shared")
-    .define("OPERATION(value)", "value")
-    .do_static_compilation(true);
-
-GPU_SHADER_CREATE_INFO(compositor_summed_area_table_compute_complete_blocks_square)
-    .additional_info("compositor_summed_area_table_compute_complete_blocks_shared")
-    .define("OPERATION(value)", "value * value")
-    .do_static_compilation(true);
--- a/source/blender/compositor/realtime_compositor/shaders/library/gpu_shader_compositor_summed_area_table_lib.glsl
+++ b/source/blender/compositor/realtime_compositor/shaders/library/gpu_shader_compositor_summed_area_table_lib.glsl
@ -2,6 +2,27 @@
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

+#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
+
+#define SAT_BLOCK_SIZE 16
+/* Loads the value of the summed area table at the given texel as two parts that are not added
+ * together here. The returned value is the texel of the blocks texture. The inter_value output
+ * is the sum of the X prologues of the texel's block column, taken over the rows that start at
+ * the first row of the texel's block and end at the texel's row, plus the Y prologue of the
+ * texel's column and block row. SAT_BLOCK_SIZE is the side length of a block in texels.
+ * NOTE(review): it presumably has to match the 16x16 group size of the shader that writes the
+ * blocks, confirm. Every load passes a zero fallback color for out of bound texels. */
+vec4 summed_area_table_value(sampler2D blocks,
+                             sampler2D x_prologues,
+                             sampler2D y_prologues,
+                             ivec2 texel,
+                             out vec4 inter_value)
+{
+  inter_value = vec4(0.0);
+  int start_y = (texel.y / SAT_BLOCK_SIZE) * SAT_BLOCK_SIZE;
+  for (int i = start_y; i <= texel.y; i++) {
+    inter_value += texture_load(x_prologues, ivec2(i, texel.x / SAT_BLOCK_SIZE), vec4(0.0));
+  }
+  /* Add the Y prologue of the texel's column for the block row that contains the texel. */
+  inter_value += texture_load(y_prologues, texel / ivec2(1, SAT_BLOCK_SIZE), vec4(0.0));
+  /* The blocks texel is returned on its own, it is not added to the inter-block value. */
+  return texture_load(blocks, texel, vec4(0.0));
+}
+
 /* Computes the sum of the rectangular region defined by the given lower and upper bounds from the
 * given summed area table. It is assumed that the given upper bound is larger than the given lower
 * bound, otherwise, undefined behavior is invoked. Looking at the diagram below, in order to
@ -35,17 +56,31 @@
 * The aforementioned equation eliminates the edges between regions X, C, and A since they get
 * subtracted with C and A. To avoid this, we subtract 1 from the lower bound and fallback to zero
 * for out of bound sampling. */
-
-#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
-
-vec4 summed_area_table_sum(sampler2D table, ivec2 lower_bound, ivec2 upper_bound)
+vec4 summed_area_table_sum(sampler2D blocks,
+                           sampler2D x_prologues,
+                           sampler2D y_prologues,
+                           ivec2 lower_bound,
+                           ivec2 upper_bound)
 {
-  ivec2 corrected_lower_bound = lower_bound - ivec2(1);
-  ivec2 corrected_upper_bound = min(texture_size(table) - ivec2(1), upper_bound);
-  vec4 addend = texture_load(table, corrected_upper_bound, vec4(0.0)) +
-                texture_load(table, corrected_lower_bound, vec4(0.0));
-  vec4 subtrahend =
-      texture_load(table, ivec2(corrected_lower_bound.x, corrected_upper_bound.y), vec4(0.0)) +
-      texture_load(table, ivec2(corrected_upper_bound.x, corrected_lower_bound.y), vec4(0.0));
-  return addend - subtrahend;
+  ivec2 lower_left_texel = lower_bound - ivec2(1);
+  ivec2 upper_right_texel = min(texture_size(blocks) - ivec2(1), upper_bound);
+  ivec2 upper_left_texel = ivec2(lower_left_texel.x, upper_right_texel.y);
+  ivec2 lower_right_texel = ivec2(upper_right_texel.x, lower_left_texel.y);
+  /* Each corner is looked up as an intra-block value, returned by summed_area_table_value, and
+   * an inter-block value, written to one of the output variables declared below. */
+  vec4 inter_upper_right, inter_lower_left, inter_upper_left, inter_lower_right;
+
+  vec4 intra_upper_right = summed_area_table_value(
+      blocks, x_prologues, y_prologues, upper_right_texel, inter_upper_right);
+  vec4 intra_lower_left = summed_area_table_value(
+      blocks, x_prologues, y_prologues, lower_left_texel, inter_lower_left);
+  vec4 intra_upper_left = summed_area_table_value(
+      blocks, x_prologues, y_prologues, upper_left_texel, inter_upper_left);
+  vec4 intra_lower_right = summed_area_table_value(
+      blocks, x_prologues, y_prologues, lower_right_texel, inter_lower_right);
+  /* Combine the four corners for the intra-block and inter-block parts separately, and only add
+   * the two partial results at the end. NOTE(review): presumably done to limit floating point
+   * cancellation between the larger inter-block sums, confirm. */
+  vec4 intra_value = (intra_upper_right + intra_lower_left) -
+                     (intra_upper_left + intra_lower_right);
+  vec4 inter_value = (inter_upper_right + inter_lower_left) -
+                     (inter_upper_left + inter_lower_right);
+  return intra_value + inter_value;
 }
--- a/source/blender/nodes/composite/nodes/node_composite_kuwahara.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_kuwahara.cc
@ -122,13 +122,25 @@ class ConvertKuwaharaOperation : public NodeOperation {

  void execute_classic_summed_area_table()
  {
-    Result table = context().create_temporary_result(ResultType::Color, ResultPrecision::Full);
-    summed_area_table(context(), get_input("Image"), table);
+    Result blocks = context().create_temporary_result(ResultType::Color, ResultPrecision::Full);
+    Result x_prologues = context().create_temporary_result(ResultType::Color,
+                                                           ResultPrecision::Full);
+    Result y_prologues = context().create_temporary_result(ResultType::Color,
+                                                           ResultPrecision::Full);
+    summed_area_table(context(), get_input("Image"), blocks, x_prologues, y_prologues);

-    Result squared_table = context().create_temporary_result(ResultType::Color,
-                                                             ResultPrecision::Full);
-    summed_area_table(
-        context(), get_input("Image"), squared_table, SummedAreaTableOperation::Square);
+    Result squared_blocks = context().create_temporary_result(ResultType::Color,
+                                                              ResultPrecision::Full);
+    Result squared_x_prologues = context().create_temporary_result(ResultType::Color,
+                                                                   ResultPrecision::Full);
+    Result squared_y_prologues = context().create_temporary_result(ResultType::Color,
+                                                                   ResultPrecision::Full);
+    summed_area_table(context(),
+                      get_input("Image"),
+                      squared_blocks,
+                      squared_x_prologues,
+                      squared_y_prologues,
+                      SummedAreaTableOperation::Square);

    GPUShader *shader = context().get_shader(get_classic_summed_area_table_shader_name());
    GPU_shader_bind(shader);
@ -141,8 +153,13 @@ class ConvertKuwaharaOperation : public NodeOperation {
      size_input.bind_as_texture(shader, "size_tx");
    }

-    table.bind_as_texture(shader, "table_tx");
-    squared_table.bind_as_texture(shader, "squared_table_tx");
+    blocks.bind_as_texture(shader, "blocks_tx");
+    x_prologues.bind_as_texture(shader, "x_prologues_tx");
+    y_prologues.bind_as_texture(shader, "y_prologues_tx");
+
+    squared_blocks.bind_as_texture(shader, "squared_blocks_tx");
+    squared_x_prologues.bind_as_texture(shader, "squared_x_prologues_tx");
+    squared_y_prologues.bind_as_texture(shader, "squared_y_prologues_tx");

    const Domain domain = compute_domain();
    Result &output_image = get_result("Image");
@ -151,13 +168,21 @@ class ConvertKuwaharaOperation : public NodeOperation {

    compute_dispatch_threads_at_least(shader, domain.size);

-    table.unbind_as_texture();
-    squared_table.unbind_as_texture();
+    blocks.unbind_as_texture();
+    x_prologues.unbind_as_texture();
+    y_prologues.unbind_as_texture();
+    squared_blocks.unbind_as_texture();
+    squared_x_prologues.unbind_as_texture();
+    squared_y_prologues.unbind_as_texture();
    output_image.unbind_as_image();
    GPU_shader_unbind();

-    table.release();
-    squared_table.release();
+    blocks.release();
+    x_prologues.release();
+    y_prologues.release();
+    squared_blocks.release();
+    squared_x_prologues.release();
+    squared_y_prologues.release();
  }

  /* An implementation of the Anisotropic Kuwahara filter described in the paper: