Speedup classic Kuwahara filter by summed area table #111150

Merged
Habib Gahbiche merged 30 commits from zazizizou/blender:com-kuwahara-sat into main 2023-11-01 10:49:18 +01:00
2 changed files with 24 additions and 173 deletions
Showing only changes of commit fad4cffc41 - Show all commits

View File

@ -35,6 +35,7 @@ void KuwaharaNode::convert_to_operations(NodeConverter &converter,
kuwahara_classic->set_use_sat(false);
}
else {
kuwahara_classic->set_use_sat(true);
zazizizou marked this conversation as resolved Outdated

Add a kuwahara_classic->set_use_sat(true); just for clarity.

Add a `kuwahara_classic->set_use_sat(true);` just for clarity.
SummedAreaTableOperation *sat = new SummedAreaTableOperation();
sat->set_mode(SummedAreaTableOperation::eMode::Identity);
converter.add_operation(sat);

View File

@ -63,74 +63,26 @@ void SummedAreaTableOperation::update_memory_buffer(MemoryBuffer *output,
/* Note: although this is a single threaded call, multithreading is used. */
MemoryBuffer *image = inputs[0];
/* First pass: copy values from input to output and square values if necessary. */
/* First pass: copy input to output and sum horizontally. */
threading::parallel_for(IndexRange(area.ymin, area.ymax), 1, [&](const IndexRange range_y) {
zazizizou marked this conversation as resolved Outdated

It is sufficient to have a single parallel loop over rows and a serial loop over columns, too much parallelism will hurt performance.

This copy loop can be fused with the horizontal pass.

It is sufficient to have a single parallel loop over rows and a serial loop over columns, too much parallelism will hurt performance. This copy loop can be fused with the horizontal pass.
threading::parallel_for(IndexRange(area.xmin, area.xmax), 1, [&](const IndexRange range_x) {
for (int64_t y = *range_y.begin(); y < *range_y.end(); y++) {
for (int64_t x = *range_x.begin(); x < *range_x.end(); x++) {
float4 color;
image->read_elem(x, y, &color.x);
float *out = output->get_elem(x, y);
switch (mode_) {
case eMode::Squared: {
color *= color;
break;
}
case eMode::Identity: {
/* Pass. */
break;
}
default: {
BLI_assert_msg(0, "Mode not implemented");
break;
}
}
out[0] = color.x;
out[1] = color.y;
out[2] = color.z;
out[3] = color.w;
}
}
});
});
/* Second pass. */
threading::parallel_for(IndexRange(area.ymin, area.ymax), 1, [&](const IndexRange range_y) {
for (int64_t y = *range_y.begin(); y < *range_y.end(); y++) {
/* Track floating point error. See below. */
float4 running_compensation = {0.0f, 0.0f, 0.0f, 0.0f};
for (int x = area.xmin; x < area.xmax; x++) {
float4 color;
output->read_elem_checked(x - 1, y, &color.x);
float *out = output->get_elem(x, y);
out[0] += color.x;
out[1] += color.y;
out[2] += color.z;
out[3] += color.w;
for (const int y : range_y) {
float4 accumulated_color = float4(0.0f);
for (const int x : IndexRange(area.xmin, area.xmax)) {
const float4 color = float4(image->get_elem(x, y));
accumulated_color += mode_ == eMode::Squared? color * color : color;
copy_v4_v4(output->get_elem(x, y), accumulated_color);
}
}
});
/* Third pass: vertical sum. */
/* Second pass: vertical sum. */
threading::parallel_for(IndexRange(area.xmin, area.xmax), 1, [&](const IndexRange range_x) {
for (int64_t x = *range_x.begin(); x < *range_x.end(); x++) {
for (int y = area.ymin; y < area.ymax; y++) {
for(const int x : range_x) {
for(const int y : IndexRange(area.ymin, area.ymax)) {
float4 color;
zazizizou marked this conversation as resolved Outdated

Use a temporary accumulated_color variable and avoid reading the buffer again just like the above loop. Then, use get_elem instead of read_elem_checked.

Use a temporary `accumulated_color` variable and avoid reading the buffer again just like the above loop. Then, use `get_elem` instead of `read_elem_checked`.
output->read_elem_checked(x, y - 1, &color.x);
float *out = output->get_elem(x, y);
out[0] += color.x;
out[1] += color.y;
out[2] += color.z;
out[3] += color.w;
copy_v4_v4(out, float4(out) + color);
}
}
});
@ -138,77 +90,32 @@ void SummedAreaTableOperation::update_memory_buffer(MemoryBuffer *output,
MemoryBuffer *SummedAreaTableOperation::create_memory_buffer(rcti *area)
{
/* Note: although this is a single threaded call, multithreading is used. */
MemoryBuffer *output = new MemoryBuffer(DataType::Color, *area);
PixelSampler sampler = PixelSampler::Nearest;
/* First pass: copy values from input to output and square values if necessary. */
/* First pass: copy input to output and sum horizontally. */
threading::parallel_for(IndexRange(area->ymin, area->ymax), 1, [&](const IndexRange range_y) {
threading::parallel_for(IndexRange(area->xmin, area->xmax), 1, [&](const IndexRange range_x) {
for (float y = float(*range_y.begin()); y < float(*range_y.end()); y++) {
for (float x = float(*range_x.begin()); x < float(*range_x.end()); x++) {
float4 color;
image_reader_->read_sampled(&color.x, x, y, sampler);
float *out = output->get_elem(x, y);
switch (mode_) {
case eMode::Squared: {
color *= color;
break;
}
case eMode::Identity: {
/* Pass. */
break;
}
default: {
BLI_assert_msg(0, "Mode not implemented");
break;
}
}
out[0] = color.x;
out[1] = color.y;
out[2] = color.z;
out[3] = color.w;
}
}
});
});
/* Second pass. */
threading::parallel_for(IndexRange(area->ymin, area->ymax), 1, [&](const IndexRange range_y) {
for (int64_t y = *range_y.begin(); y < *range_y.end(); y++) {
/* Track floating point error. See below. */
float4 running_compensation = {0.0f, 0.0f, 0.0f, 0.0f};
for (int x = area->xmin; x < area->xmax; x++) {
for (const int y : range_y) {
float4 accumulated_color = float4(0.0f);
for (const int x : IndexRange(area->xmin, area->xmax)) {
float4 color;
output->read_elem_checked(x - 1, y, &color.x);
float *out = output->get_elem(x, y);
out[0] += color.x;
out[1] += color.y;
out[2] += color.z;
out[3] += color.w;
image_reader_->read_sampled(&color.x, x, y, sampler);
zazizizou marked this conversation as resolved Outdated

Use read instead of read_sampled. Same applies for all read_sampled calls below.

Use `read` instead of `read_sampled`. Same applies for all `read_sampled` calls below.
accumulated_color += mode_ == eMode::Squared? color * color : color;
zazizizou marked this conversation as resolved Outdated

This can be more compact.

  • Use a for each loop on ranges. for (const int y : sub_y_range) {
  • Accumulate a color instead of reading the previous output.
  • Use the get_elem function.
  • Use a copy function.
  threading::parallel_for(IndexRange(area.ymin, area.ymax), 1, [&](const IndexRange sub_y_range) {
    for (const int y : sub_y_range) {
      float4 accumulated_color = float4(0.0f);
      for (const int x : IndexRange(area.xmin, area.xmax)) {
        const float4 color = float4(image->get_elem(x, y));
        accumulated_color += color * color;
        copy_v4_v4(output->get_elem(x, y), accumulated_color);
      }
    }
  });
This can be more compact. - Use a for each loop on ranges. `for (const int y : sub_y_range) {` - Accumulate a color instead of reading the previous output. - Use the `get_elem` function. - Use a copy function. ```cpp threading::parallel_for(IndexRange(area.ymin, area.ymax), 1, [&](const IndexRange sub_y_range) { for (const int y : sub_y_range) { float4 accumulated_color = float4(0.0f); for (const int x : IndexRange(area.xmin, area.xmax)) { const float4 color = float4(image->get_elem(x, y)); accumulated_color += color * color; copy_v4_v4(output->get_elem(x, y), accumulated_color); } } }); ```

Will do, thanks for the tip :)

Will do, thanks for the tip :)
copy_v4_v4(output->get_elem(x, y), accumulated_color);
}
}
});
/* Third pass: vertical sum. */
/* Second pass: vertical sum. */
threading::parallel_for(IndexRange(area->xmin, area->xmax), 1, [&](const IndexRange range_x) {
for (int64_t x = *range_x.begin(); x < *range_x.end(); x++) {
for (int y = area->ymin; y < area->ymax; y++) {
for(const int x : range_x) {
for(const int y : IndexRange(area->ymin, area->ymax)) {
float4 color;
output->read_elem_checked(x, y - 1, &color.x);
zazizizou marked this conversation as resolved
Review

Same as above.

Same as above.
float *out = output->get_elem(x, y);
out[0] += color.x;
out[1] += color.y;
out[2] += color.z;
out[3] += color.w;
copy_v4_v4(out, float4(out) + color);
}
}
});
@ -216,63 +123,6 @@ MemoryBuffer *SummedAreaTableOperation::create_memory_buffer(rcti *area)
return output;
}
//MemoryBuffer *SummedAreaTableOperation::create_memory_buffer(rcti *rect)
//{
// MemoryBuffer *result = new MemoryBuffer(DataType::Color, *rect);
// PixelSampler sampler = PixelSampler::Nearest;
//
// /* Track floating point error. See below. */
// float4 running_compensation = {0.0f, 0.0f, 0.0f, 0.0f};
//
// for (BuffersIterator<float> it = result->iterate_with({}, *rect); !it.is_end(); ++it) {
// const int x = it.x;
// const int y = it.y;
//
// float4 color, upper, left, upper_left;
// image_reader_->read_sampled(color, x, y, sampler);
//
// result->read_elem_checked(x, y - 1, &upper.x);
// result->read_elem_checked(x - 1, y, &left.x);
// result->read_elem_checked(x - 1, y - 1, &upper_left.x);
//
// float4 sum = upper + left - upper_left;
//
// float4 v;
// switch (mode_) {
// case eMode::Squared: {
// v = color * color;
// break;
// }
// case eMode::Identity: {
// v = color;
// break;
// }
// default: {
// BLI_assert_msg(0, "Mode not implemented");
// break;
// }
// }
//
// /* Using Kahan Summation algorithm to compensate for floating point inaccuracies caused by
// * summing up large number of values.
// * The idea is to introduce a variable to keep track of the error (here called `running_error`)
// * and then correct the error in the next iteration. */
// float4 difference = v - running_compensation;
// float4 temp = sum + difference;
// /* `(temp - sum)` cancels the high-order part of `difference`. Subtracting `difference` again
// * recovers `difference` for the next iteration. */
// running_compensation = (temp - sum) - difference;
// sum = temp;
//
// it.out[0] = sum.x;
// it.out[1] = sum.y;
// it.out[2] = sum.z;
// it.out[3] = sum.w;
// }
//
// return result;
//}
void SummedAreaTableOperation::set_mode(eMode mode)
zazizizou marked this conversation as resolved Outdated

I think we should attempt to multithread the SAT computation. Not sure if there is anything stopping us from doing that, but a two pass prefix sum should be easy to implement and efficient to parallelize on the CPU.

| Image | -> Prefix sum from left to right -> | Horizontal Pass Result | -> Prefix sum from bottom to top -> | Desired SAT |

Each of the prefix sums can simply be a parallel loop over rows/columns.

I think we should attempt to multithread the SAT computation. Not sure if there is anything stopping us from doing that, but a two pass prefix sum should be easy to implement and efficient to parallelize on the CPU. ``` | Image | -> Prefix sum from left to right -> | Horizontal Pass Result | -> Prefix sum from bottom to top -> | Desired SAT | ``` Each of the prefix sums can simply be a parallel loop over rows/columns.

As discussed in the meeting, my concern was using SingleThreadedOperation for a multi-threaded execution. I will upload a patch using TBB.

As discussed in the meeting, my concern was using `SingleThreadedOperation` for a multi-threaded execution. I will upload a patch using TBB.
{
mode_ = mode;