Realtime Compositor: Implement Anisotropic Kuwahara #110786

Merged
Omar Emara merged 17 commits from OmarEmaraDev/blender:gpu-anisotropic-kuwahara into main 2023-08-17 16:58:42 +02:00
13 changed files with 1156 additions and 314 deletions

@@ -346,6 +346,8 @@ if(WITH_COMPOSITOR_CPU)
operations/COM_GaussianYBlurOperation.cc
operations/COM_GaussianYBlurOperation.h
operations/COM_KuwaharaAnisotropicOperation.cc
operations/COM_KuwaharaAnisotropicOperation.h
operations/COM_KuwaharaAnisotropicStructureTensorOperation.cc
operations/COM_KuwaharaAnisotropicStructureTensorOperation.h
operations/COM_KuwaharaClassicOperation.cc
operations/COM_KuwaharaClassicOperation.h

@@ -2,15 +2,16 @@
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "DNA_node_types.h"
#include "DNA_scene_types.h"
#include "COM_KuwaharaNode.h"
#include "COM_ConvertOperation.h"
#include "COM_ConvolutionFilterOperation.h"
#include "COM_FastGaussianBlurOperation.h"
#include "COM_GaussianXBlurOperation.h"
#include "COM_GaussianYBlurOperation.h"
#include "COM_KuwaharaAnisotropicOperation.h"
#include "COM_KuwaharaAnisotropicStructureTensorOperation.h"
#include "COM_KuwaharaClassicOperation.h"
#include "COM_MathBaseOperation.h"
#include "COM_SetValueOperation.h"
namespace blender::compositor {
@@ -32,74 +33,46 @@ void KuwaharaNode::convert_to_operations(NodeConverter &converter,
}
case CMP_NODE_KUWAHARA_ANISOTROPIC: {
/* Edge detection on luminance. */
auto rgb_to_lum = new ConvertColorToBWOperation();
converter.add_operation(rgb_to_lum);
converter.map_input_socket(get_input_socket(0), rgb_to_lum->get_input_socket(0));
KuwaharaAnisotropicStructureTensorOperation *structure_tensor_operation =
new KuwaharaAnisotropicStructureTensorOperation();
converter.add_operation(structure_tensor_operation);
converter.map_input_socket(get_input_socket(0),
structure_tensor_operation->get_input_socket(0));
auto const_fact = new SetValueOperation();
const_fact->set_value(1.0f);
converter.add_operation(const_fact);
NodeBlurData blur_data;
blur_data.sizex = data->uniformity;
blur_data.sizey = data->uniformity;
blur_data.relative = false;
blur_data.filtertype = R_FILTER_GAUSS;
auto sobel_x = new ConvolutionFilterOperation();
sobel_x->set3x3Filter(1, 0, -1, 2, 0, -2, 1, 0, -1);
converter.add_operation(sobel_x);
converter.add_link(rgb_to_lum->get_output_socket(0), sobel_x->get_input_socket(0));
converter.add_link(const_fact->get_output_socket(0), sobel_x->get_input_socket(1));
GaussianXBlurOperation *blur_x_operation = new GaussianXBlurOperation();
blur_x_operation->set_data(&blur_data);
blur_x_operation->set_size(1.0f);
auto sobel_y = new ConvolutionFilterOperation();
sobel_y->set3x3Filter(1, 2, 1, 0, 0, 0, -1, -2, -1);
converter.add_operation(sobel_y);
converter.add_link(rgb_to_lum->get_output_socket(0), sobel_y->get_input_socket(0));
converter.add_link(const_fact->get_output_socket(0), sobel_y->get_input_socket(1));
converter.add_operation(blur_x_operation);
converter.add_link(structure_tensor_operation->get_output_socket(0),
blur_x_operation->get_input_socket(0));
/* Compute intensity of edges. */
auto sobel_xx = new MathMultiplyOperation();
auto sobel_yy = new MathMultiplyOperation();
auto sobel_xy = new MathMultiplyOperation();
converter.add_operation(sobel_xx);
converter.add_operation(sobel_yy);
converter.add_operation(sobel_xy);
GaussianYBlurOperation *blur_y_operation = new GaussianYBlurOperation();
blur_y_operation->set_data(&blur_data);
blur_y_operation->set_size(1.0f);
converter.add_link(sobel_x->get_output_socket(0), sobel_xx->get_input_socket(0));
converter.add_link(sobel_x->get_output_socket(0), sobel_xx->get_input_socket(1));
converter.add_operation(blur_y_operation);
converter.add_link(blur_x_operation->get_output_socket(0),
blur_y_operation->get_input_socket(0));
converter.add_link(sobel_y->get_output_socket(0), sobel_yy->get_input_socket(0));
converter.add_link(sobel_y->get_output_socket(0), sobel_yy->get_input_socket(1));
KuwaharaAnisotropicOperation *kuwahara_anisotropic_operation =
new KuwaharaAnisotropicOperation();
kuwahara_anisotropic_operation->data = *data;
converter.add_link(sobel_x->get_output_socket(0), sobel_xy->get_input_socket(0));
converter.add_link(sobel_y->get_output_socket(0), sobel_xy->get_input_socket(1));
converter.add_operation(kuwahara_anisotropic_operation);
converter.map_input_socket(get_input_socket(0),
kuwahara_anisotropic_operation->get_input_socket(0));
converter.add_link(blur_y_operation->get_output_socket(0),
kuwahara_anisotropic_operation->get_input_socket(1));
/* Blurring for more robustness. */
const int sigma = data->smoothing;
auto blur_sobel_xx = new FastGaussianBlurOperation();
auto blur_sobel_yy = new FastGaussianBlurOperation();
auto blur_sobel_xy = new FastGaussianBlurOperation();
blur_sobel_yy->set_size(sigma, sigma);
blur_sobel_xx->set_size(sigma, sigma);
blur_sobel_xy->set_size(sigma, sigma);
converter.add_operation(blur_sobel_xx);
converter.add_operation(blur_sobel_yy);
converter.add_operation(blur_sobel_xy);
converter.add_link(sobel_xx->get_output_socket(0), blur_sobel_xx->get_input_socket(0));
converter.add_link(sobel_yy->get_output_socket(0), blur_sobel_yy->get_input_socket(0));
converter.add_link(sobel_xy->get_output_socket(0), blur_sobel_xy->get_input_socket(0));
/* Apply anisotropic Kuwahara filter. */
KuwaharaAnisotropicOperation *aniso = new KuwaharaAnisotropicOperation();
aniso->set_kernel_size(data->size + 4);
converter.map_input_socket(get_input_socket(0), aniso->get_input_socket(0));
converter.add_operation(aniso);
converter.add_link(blur_sobel_xx->get_output_socket(0), aniso->get_input_socket(1));
converter.add_link(blur_sobel_yy->get_output_socket(0), aniso->get_input_socket(2));
converter.add_link(blur_sobel_xy->get_output_socket(0), aniso->get_input_socket(3));
converter.map_output_socket(get_output_socket(0), aniso->get_output_socket(0));
converter.map_output_socket(get_output_socket(0),
kuwahara_anisotropic_operation->get_output_socket(0));
break;
}
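For orientation, the rewired operation graph for the anisotropic case, as established by the links above, can be sketched as follows (an editor's summary of the new wiring, not code from the patch):

/* Input image ──> KuwaharaAnisotropicStructureTensorOperation
 *                            │
 *                            ▼
 *               GaussianXBlurOperation (size = uniformity)
 *                            │
 *                            ▼
 *               GaussianYBlurOperation (size = uniformity)
 *                            │  (smoothed structure tensor)
 *                            ▼
 * Input image ──> KuwaharaAnisotropicOperation ──> mapped output */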

@@ -4,9 +4,12 @@
#include "COM_KuwaharaAnisotropicOperation.h"
#include "BLI_math_base.h"
#include "BLI_math_base.hh"
#include "BLI_vector.hh"
#include "IMB_colormanagement.h"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector.h"
#include "BLI_math_vector.hh"
#include "BLI_math_vector_types.hh"
namespace blender::compositor {
@@ -14,280 +17,547 @@ KuwaharaAnisotropicOperation::KuwaharaAnisotropicOperation()
{
this->add_input_socket(DataType::Color);
this->add_input_socket(DataType::Color);
this->add_input_socket(DataType::Color);
this->add_input_socket(DataType::Color);
this->add_output_socket(DataType::Color);
this->n_div_ = 8;
this->set_kernel_size(5);
this->flags_.is_fullframe_operation = true;
}
void KuwaharaAnisotropicOperation::init_execution()
{
image_reader_ = this->get_input_socket_reader(0);
s_xx_reader_ = this->get_input_socket_reader(1);
s_yy_reader_ = this->get_input_socket_reader(2);
s_xy_reader_ = this->get_input_socket_reader(3);
structure_tensor_reader_ = this->get_input_socket_reader(1);
}
void KuwaharaAnisotropicOperation::deinit_execution()
{
image_reader_ = nullptr;
structure_tensor_reader_ = nullptr;
}
/* An implementation of the Anisotropic Kuwahara filter described in the paper:
*
* Kyprianidis, Jan Eric, Henry Kang, and Jurgen Dollner. "Image and video abstraction by
* anisotropic Kuwahara filtering." 2009.
*
* But with the polynomial weighting functions described in the paper:
*
* Kyprianidis, Jan Eric, et al. "Anisotropic Kuwahara Filtering with Polynomial Weighting
* Functions." 2010.
*
* And the sector weight function described in the paper:
*
* Kyprianidis, Jan Eric. "Image and video abstraction by multi-scale anisotropic Kuwahara
* filtering." 2011.
*/
void KuwaharaAnisotropicOperation::execute_pixel_sampled(float output[4],
float x,
float y,
PixelSampler sampler)
float x_float,
float y_float,
PixelSampler /* sampler */)
{
const int width = this->get_width();
const int height = this->get_height();
using namespace math;
const int x = x_float;
const int y = y_float;
BLI_assert(width == s_xx_reader_->get_width());
BLI_assert(height == s_xx_reader_->get_height());
BLI_assert(width == s_yy_reader_->get_width());
BLI_assert(height == s_yy_reader_->get_height());
BLI_assert(width == s_xy_reader_->get_width());
BLI_assert(height == s_xy_reader_->get_height());
/* The structure tensor is encoded in a float4 using a column major storage order, as can be
* seen in the KuwaharaAnisotropicStructureTensorOperation. */
float4 encoded_structure_tensor;
structure_tensor_reader_->read(encoded_structure_tensor, x, y, nullptr);
float dxdx = encoded_structure_tensor.x;
float dxdy = encoded_structure_tensor.y;
float dydy = encoded_structure_tensor.w;
/* Values recommended by authors in original paper. */
const float angle = 2.0 * M_PI / n_div_;
const float q = 3.0;
const float EPS = 1.0e-10;
/* Compute the first and second eigenvalues of the structure tensor using the equations in
* section "3.1 Orientation and Anisotropy Estimation" of the paper. */
float eigenvalue_first_term = (dxdx + dydy) / 2.0f;
float eigenvalue_square_root_term = sqrt(square(dxdx - dydy) + 4.0f * square(dxdy)) / 2.0f;
float first_eigenvalue = eigenvalue_first_term + eigenvalue_square_root_term;
float second_eigenvalue = eigenvalue_first_term - eigenvalue_square_root_term;
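For reference, writing the encoded components as E = dxdx, F = dxdy and G = dydy, the structure tensor and the eigenvalues computed above are (restating section 3.1 of the paper):

S = \begin{pmatrix} E & F \\ F & G \end{pmatrix},
\qquad
\lambda_{1,2} = \frac{E + G}{2} \pm \frac{\sqrt{(E - G)^2 + 4 F^2}}{2}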
/* All channels are identical. Take first channel for simplicity. */
float tmp[4];
s_xx_reader_->read(tmp, x, y, nullptr);
const float a = tmp[1];
s_xy_reader_->read(tmp, x, y, nullptr);
const float b = tmp[1];
s_yy_reader_->read(tmp, x, y, nullptr);
const float c = tmp[1];
/* Compute the normalized eigenvector of the structure tensor oriented in direction of the
* minimum rate of change using the equations in section "3.1 Orientation and Anisotropy
* Estimation" of the paper. */
float2 eigenvector = float2(first_eigenvalue - dxdx, -dxdy);
float eigenvector_length = length(eigenvector);
float2 unit_eigenvector = eigenvector_length != 0.0f ? eigenvector / eigenvector_length :
float2(1.0f);
/* Compute eigenvalues of the structure tensor. */
const double tr = a + c;
const double discr = sqrt((a - b) * (a - b) + 4 * b * c);
const double lambda1 = (tr + discr) / 2;
const double lambda2 = (tr - discr) / 2;
/* Compute the amount of anisotropy using equations in section "3.1 Orientation and Anisotropy
* Estimation" of the paper. The anisotropy ranges from 0 to 1, where 0 corresponds to
* isotropic and 1 corresponds to entirely anisotropic regions. */
float eigenvalue_sum = first_eigenvalue + second_eigenvalue;
float eigenvalue_difference = first_eigenvalue - second_eigenvalue;
float anisotropy = eigenvalue_sum > 0.0f ? eigenvalue_difference / eigenvalue_sum : 0.0f;
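Since the structure tensor is positive semi-definite, λ1 ≥ λ2 ≥ 0, so the anisotropy

A = \frac{\lambda_1 - \lambda_2}{\lambda_1 + \lambda_2}

indeed lies in [0, 1], with the ternary above guarding the degenerate λ1 + λ2 = 0 case.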
/* Compute orientation and its strength based on structure tensor. */
const double orientation = 0.5 * atan2(2 * b, a - c);
const double strength = (lambda1 == 0 && lambda2 == 0) ?
0 :
(lambda1 - lambda2) / (lambda1 + lambda2);
/* Compute the width and height of an ellipse that is more width-elongated for high anisotropy
* and more circular for low anisotropy, controlled using the eccentricity factor. Since the
* anisotropy is in the [0, 1] range, the width factor tends to 1 as the eccentricity tends to
* infinity and tends to infinity when the eccentricity tends to zero. This is based on the
* equations in section "3.2. Anisotropic Kuwahara Filtering" of the paper. */
float ellipse_width_factor = (get_eccentricity() + anisotropy) / get_eccentricity();
float ellipse_width = ellipse_width_factor * data.size;
float ellipse_height = data.size / ellipse_width_factor;
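In equation form, with eccentricity factor α (see get_eccentricity below), anisotropy A, and radius r = data.size:

a = \frac{\alpha + A}{\alpha} \, r, \qquad b = \frac{\alpha}{\alpha + A} \, r, \qquad a b = r^2

so the ellipse always preserves the area of the isotropic disk of radius r.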
Vector<double> mean(n_div_);
Vector<double> sum(n_div_);
Vector<double> var(n_div_);
Vector<double> weight(n_div_);
/* Compute the cosine and sine of the angle that the eigenvector makes with the x axis. Since
* the eigenvector is normalized, its x and y components are the cosine and sine of the angle
* it makes with the x axis. */
float cosine = unit_eigenvector.x;
float sine = unit_eigenvector.y;
for (int ch = 0; ch < 3; ch++) {
mean.fill(0.0);
sum.fill(0.0);
var.fill(0.0);
weight.fill(0.0);
/* Compute an inverse transformation matrix that represents an ellipse of the given width and
 * height that makes an angle with the x axis whose cosine and sine are the given values. This is
 * an inverse matrix, so it transforms the ellipse into a disk of unit radius. */
float2x2 inverse_ellipse_matrix = float2x2(
float2(cosine / ellipse_width, -sine / ellipse_height),
float2(sine / ellipse_width, cosine / ellipse_height));
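Written out, with φ the angle of the eigenvector, the matrix constructed above (in column-major form) is the axis scaling applied after a rotation by −φ:

M^{-1}
= \begin{pmatrix} 1/a & 0 \\ 0 & 1/b \end{pmatrix}
  \begin{pmatrix} \cos\varphi & \sin\varphi \\ -\sin\varphi & \cos\varphi \end{pmatrix}
= \begin{pmatrix} \cos\varphi / a & \sin\varphi / a \\ -\sin\varphi / b & \cos\varphi / b \end{pmatrix}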
double sx = 1.0f / (strength + 1.0f);
double sy = (1.0f + strength) / 1.0f;
double theta = -orientation;
/* Compute the bounding box of a zero centered ellipse whose major axis is aligned with the
* eigenvector and has the given width and height. This is based on the equations described in:
*
* https://iquilezles.org/articles/ellipses/
*
* Notice that we only compute the upper bound; the lower bound is just the negative of that,
* since the ellipse is zero centered. Also notice that we take the ceiling of the bounding box,
* just to ensure the filter window is at least 1x1. */
float2 ellipse_major_axis = ellipse_width * unit_eigenvector;
float2 ellipse_minor_axis = ellipse_height * float2(unit_eigenvector.y, unit_eigenvector.x) *
float2(-1, 1);
int2 ellipse_bounds = int2(ceil(sqrt(square(ellipse_major_axis) + square(ellipse_minor_axis))));
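Following the referenced article, with major axis vector u and minor axis vector v as computed above, the componentwise half-extents of the bounding box are:

\text{bounds} = \left( \left\lceil \sqrt{u_x^2 + v_x^2} \,\right\rceil,\; \left\lceil \sqrt{u_y^2 + v_y^2} \,\right\rceil \right)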
for (int dy = -kernel_size_; dy <= kernel_size_; dy++) {
for (int dx = -kernel_size_; dx <= kernel_size_; dx++) {
if (dx == 0 && dy == 0)
continue;
/* Compute the overlap polynomial parameters for 8-sector ellipse based on the equations in
* section "3 Alternative Weighting Functions" of the polynomial weights paper. More on this
* later in the code. */
const int number_of_sectors = 8;
float sector_center_overlap_parameter = 2.0f / data.size;
float sector_envelope_angle = ((3.0f / 2.0f) * M_PI) / number_of_sectors;
float cross_sector_overlap_parameter = (sector_center_overlap_parameter +
cos(sector_envelope_angle)) /
square(sin(sector_envelope_angle));
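In the notation of the polynomial weights paper, with N = 8 sectors and radius r = data.size, the parameters computed above are:

\zeta = \frac{2}{r}, \qquad \theta = \frac{3\pi/2}{N}, \qquad \eta = \frac{\zeta + \cos\theta}{\sin^2\theta}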
/* Rotate and scale the kernel. This is the "anisotropic" part. */
int dx2 = int(sx * (cos(theta) * dx - sin(theta) * dy));
int dy2 = int(sy * (sin(theta) * dx + cos(theta) * dy));
/* We need to compute the weighted mean of color and squared color of each of the 8 sectors of
* the ellipse, so we declare arrays for accumulating those and initialize them in the next
* code section. */
float4 weighted_mean_of_squared_color_of_sectors[8];
float4 weighted_mean_of_color_of_sectors[8];
float sum_of_weights_of_sectors[8];
/* Clamp image to avoid artifacts at borders. */
const int xx = math::clamp(int(x) + dx2, 0, width - 1);
const int yy = math::clamp(int(y) + dy2, 0, height - 1);
const double ddx2 = double(dx2);
const double ddy2 = double(dy2);
const double theta = atan2(ddy2, ddx2) + M_PI;
const int t = int(floor(theta / angle)) % n_div_;
double d2 = dx2 * dx2 + dy2 * dy2;
double g = exp(-d2 / (2.0 * kernel_size_));
float color[4];
image_reader_->read(color, xx, yy, nullptr);
const double v = color[ch];
/* TODO(@zazizizou): only compute lum once per region */
const float lum = IMB_colormanagement_get_luminance(color);
/* TODO(@zazizizou): only compute mean for the selected region */
mean[t] += g * v;
sum[t] += g * lum;
var[t] += g * lum * lum;
weight[t] += g;
}
}
/* Calculate weighted average */
double de = 0.0;
double nu = 0.0;
for (int i = 0; i < n_div_; i++) {
double weight_inv = 1.0 / weight[i];
mean[i] = weight[i] != 0 ? mean[i] * weight_inv : 0.0;
sum[i] = weight[i] != 0 ? sum[i] * weight_inv : 0.0;
var[i] = weight[i] != 0 ? var[i] * weight_inv : 0.0;
var[i] = var[i] - sum[i] * sum[i];
var[i] = var[i] > FLT_EPSILON ? sqrt(var[i]) : FLT_EPSILON;
double w = powf(var[i], -q);
de += mean[i] * w;
nu += w;
}
double val = nu > EPS ? de / nu : 0.0;
output[ch] = val;
/* The center pixel (0, 0) is exempt from the main loop below for reasons that are explained in
* the first if statement in the loop, so we need to accumulate its color, squared color, and
* weight separately first. Luckily, the zero coordinates of the center pixel zero out most of
* the complex computations below, and it can easily be shown that the weight for the center
* pixel in all sectors is simply (1 / number_of_sectors). */
float4 center_color;
image_reader_->read(center_color, x, y, nullptr);
float4 center_color_squared = center_color * center_color;
float center_weight = 1.0f / number_of_sectors;
float4 weighted_center_color = center_color * center_weight;
float4 weighted_center_color_squared = center_color_squared * center_weight;
for (int i = 0; i < number_of_sectors; i++) {
weighted_mean_of_squared_color_of_sectors[i] = weighted_center_color_squared;
weighted_mean_of_color_of_sectors[i] = weighted_center_color;
sum_of_weights_of_sectors[i] = center_weight;
}
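To verify the 1 / number_of_sectors claim: at the origin the disk point is zero, so each of the N = 8 sector weights computed in the loop below reduces to (max(0, 0 + ζ))² = ζ², the radial Gaussian is e⁰ = 1, and dividing by the sum of the N identical weights gives:

w_{\text{center}} = \zeta^2 \cdot \frac{1}{N \zeta^2} = \frac{1}{N} = \frac{1}{8}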
/* No changes for alpha channel. */
image_reader_->read_sampled(tmp, x, y, sampler);
output[3] = tmp[3];
}
void KuwaharaAnisotropicOperation::set_kernel_size(int kernel_size)
{
/* Filter will be split into n_div.
* Add n_div / 2 to avoid artifacts such as random black pixels in image. */
kernel_size_ = kernel_size + n_div_ / 2;
}
int KuwaharaAnisotropicOperation::get_kernel_size()
{
return kernel_size_;
}
int KuwaharaAnisotropicOperation::get_n_div()
{
return n_div_;
/* Loop over the window of pixels inside the bounding box of the ellipse. However, we utilize
* the fact that ellipses are mirror symmetric along the horizontal axis, so we reduce the
* window to only the upper two quadrants, and compute each two mirrored pixels at the same
* time using the same weight as an optimization. */
for (int j = 0; j <= ellipse_bounds.y; j++) {
for (int i = -ellipse_bounds.x; i <= ellipse_bounds.x; i++) {
/* Since we compute each two mirrored pixels at the same time, we need to also exempt the
 * pixels whose x coordinates are negative and whose y coordinates are zero. That's because
 * those are mirrored versions of the pixels whose x coordinates are positive and whose y
 * coordinates are zero, and we don't want to compute and accumulate them twice. Moreover,
 * we also need to exempt the center pixel with zero coordinates for the same reason.
 * However, since the mirror of the center pixel is itself, it needs to be accumulated
 * separately, which is why we did that in the code section just before this loop. */
if (j == 0 && i <= 0) {
continue;
}
/* Map the pixels of the ellipse into a unit disk, exempting any points that are not part
* of the ellipse or disk. */
float2 disk_point = inverse_ellipse_matrix * float2(i, j);
float disk_point_length_squared = dot(disk_point, disk_point);
if (disk_point_length_squared > 1.0f) {
continue;
}
/* While each pixel belongs to a single sector in the ellipse, we expand the definition of
* a sector a bit to also overlap with other sectors as illustrated in Figure 8 of the
* polynomial weights paper. So each pixel may contribute to multiple sectors, and thus we
* compute its weight in each of the 8 sectors. */
float sector_weights[8];
/* We evaluate the weighting polynomial at each of the 8 sectors by rotating the disk point
 * by 45 degrees and evaluating the weighting polynomial at each incremental rotation. To
 * avoid potentially expensive rotations, we utilize the fact that rotations by 90 degrees
 * simply swap the coordinates and negate the x component. We also note that since the y
 * term of the weighting polynomial is squared, it is not affected by the sign and can be
 * computed once for the x and once for the y coordinates. So we compute the four
 * even-indexed weights by successive 90 degree rotations as discussed. */
float2 polynomial = sector_center_overlap_parameter -
cross_sector_overlap_parameter * square(disk_point);
sector_weights[0] = square(max(0.0f, disk_point.y + polynomial.x));
sector_weights[2] = square(max(0.0f, -disk_point.x + polynomial.y));
sector_weights[4] = square(max(0.0f, -disk_point.y + polynomial.x));
sector_weights[6] = square(max(0.0f, disk_point.x + polynomial.y));
/* Then we rotate the disk point by 45 degrees, which is a simple expression involving a
* constant as can be demonstrated by applying a 45 degree rotation matrix. */
float2 rotated_disk_point = M_SQRT1_2 *
float2(disk_point.x - disk_point.y, disk_point.x + disk_point.y);
/* Finally, we compute the four odd-indexed weights starting from the disk point rotated by
 * 45 degrees. */
float2 rotated_polynomial = sector_center_overlap_parameter -
cross_sector_overlap_parameter * square(rotated_disk_point);
sector_weights[1] = square(max(0.0f, rotated_disk_point.y + rotated_polynomial.x));
sector_weights[3] = square(max(0.0f, -rotated_disk_point.x + rotated_polynomial.y));
sector_weights[5] = square(max(0.0f, -rotated_disk_point.y + rotated_polynomial.x));
sector_weights[7] = square(max(0.0f, rotated_disk_point.x + rotated_polynomial.y));
/* We compute a radial Gaussian weighting component such that pixels further away from the
 * sector center get attenuated, and we also divide by the sum of sector weights to
 * normalize them, since the radial weight will eventually be multiplied by the sector
 * weight below. */
float sector_weights_sum = sector_weights[0] + sector_weights[1] + sector_weights[2] +
sector_weights[3] + sector_weights[4] + sector_weights[5] +
sector_weights[6] + sector_weights[7];
float radial_gaussian_weight = exp(-M_PI * disk_point_length_squared) / sector_weights_sum;
/* Load the color of the pixel and its mirrored pixel and compute their squares. */
float4 upper_color;
image_reader_->read(upper_color,
clamp(x + i, 0, int(this->get_width()) - 1),
clamp(y + j, 0, int(this->get_height()) - 1),
nullptr);
float4 lower_color;
image_reader_->read(lower_color,
clamp(x - i, 0, int(this->get_width()) - 1),
clamp(y - j, 0, int(this->get_height()) - 1),
nullptr);
float4 upper_color_squared = upper_color * upper_color;
float4 lower_color_squared = lower_color * lower_color;
for (int k = 0; k < number_of_sectors; k++) {
float weight = sector_weights[k] * radial_gaussian_weight;
/* Accumulate the pixel to each of the sectors multiplied by the sector weight. */
int upper_index = k;
sum_of_weights_of_sectors[upper_index] += weight;
weighted_mean_of_color_of_sectors[upper_index] += upper_color * weight;
weighted_mean_of_squared_color_of_sectors[upper_index] += upper_color_squared * weight;
/* Accumulate the mirrored pixel to each of the sectors multiplied by the sector weight.
*/
int lower_index = (k + number_of_sectors / 2) % number_of_sectors;
sum_of_weights_of_sectors[lower_index] += weight;
weighted_mean_of_color_of_sectors[lower_index] += lower_color * weight;
weighted_mean_of_squared_color_of_sectors[lower_index] += lower_color_squared * weight;
}
}
}
/* Compute the weighted sum of the means of the sectors, such that sectors with lower standard
 * deviation get more significant weights than sectors with higher standard deviation. */
float sum_of_weights = 0.0f;
float4 weighted_sum = float4(0.0f);
for (int i = 0; i < number_of_sectors; i++) {
weighted_mean_of_color_of_sectors[i] /= sum_of_weights_of_sectors[i];
weighted_mean_of_squared_color_of_sectors[i] /= sum_of_weights_of_sectors[i];
float4 color_mean = weighted_mean_of_color_of_sectors[i];
float4 squared_color_mean = weighted_mean_of_squared_color_of_sectors[i];
float4 color_variance = abs(squared_color_mean - color_mean * color_mean);
float standard_deviation = dot(sqrt(color_variance.xyz()), float3(1.0));
/* Compute the sector weight based on the weight function introduced in section "3.3.1
* Single-scale Filtering" of the multi-scale paper. Use a threshold of 0.02 to avoid zero
* division and avoid artifacts in homogeneous regions as demonstrated in the paper. */
float weight = 1.0f / pow(max(0.02f, standard_deviation), get_sharpness());
sum_of_weights += weight;
weighted_sum += color_mean * weight;
}
weighted_sum /= sum_of_weights;
copy_v4_v4(output, weighted_sum);
}
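Putting the final loop in equation form, with m_i and σ_i the weighted color mean and standard deviation of sector i, N = 8, and q the sharpness, the output is:

\text{output} = \frac{\sum_{i=1}^{N} m_i \, \max(0.02, \sigma_i)^{-q}}{\sum_{i=1}^{N} \max(0.02, \sigma_i)^{-q}}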
/* An implementation of the Anisotropic Kuwahara filter described in the paper:
*
* Kyprianidis, Jan Eric, Henry Kang, and Jurgen Dollner. "Image and video abstraction by
* anisotropic Kuwahara filtering." 2009.
*
* But with the polynomial weighting functions described in the paper:
*
* Kyprianidis, Jan Eric, et al. "Anisotropic Kuwahara Filtering with Polynomial Weighting
* Functions." 2010.
*
* And the sector weight function described in the paper:
*
* Kyprianidis, Jan Eric. "Image and video abstraction by multi-scale anisotropic Kuwahara
* filtering." 2011.
*/
void KuwaharaAnisotropicOperation::update_memory_buffer_partial(MemoryBuffer *output,
const rcti &area,
Span<MemoryBuffer *> inputs)
{
/* Implementation based on Kyprianidis, Jan & Kang, Henry & Döllner, Jürgen. (2009).
* "Image and Video Abstraction by Anisotropic Kuwahara Filtering".
* Comput. Graph. Forum. 28. 1955-1963. 10.1111/j.1467-8659.2009.01574.x.
* Used reference implementation from lime image processing library (MIT license). */
MemoryBuffer *image = inputs[0];
MemoryBuffer *s_xx = inputs[1];
MemoryBuffer *s_yy = inputs[2];
MemoryBuffer *s_xy = inputs[3];
const int width = image->get_width();
const int height = image->get_height();
BLI_assert(width == s_xx->get_width());
BLI_assert(height == s_xx->get_height());
BLI_assert(width == s_yy->get_width());
BLI_assert(height == s_yy->get_height());
BLI_assert(width == s_xy->get_width());
BLI_assert(height == s_xy->get_height());
/* Values recommended by authors in original paper. */
const float angle = 2.0 * M_PI / n_div_;
const float q = 3.0;
const float EPS = 1.0e-10;
using namespace math;
for (BuffersIterator<float> it = output->iterate_with(inputs, area); !it.is_end(); ++it) {
const int x = it.x;
const int y = it.y;
/* The structure tensor is encoded in a float4 using a column major storage order, as can be
* seen in the KuwaharaAnisotropicStructureTensorOperation. */
float4 encoded_structure_tensor = float4(inputs[1]->get_elem(it.x, it.y));
float dxdx = encoded_structure_tensor.x;
float dxdy = encoded_structure_tensor.y;
float dydy = encoded_structure_tensor.w;
/* All channels are identical. Take first channel for simplicity. */
const float a = s_xx->get_value(x, y, 0);
const float b = s_xy->get_value(x, y, 0);
const float c = s_yy->get_value(x, y, 0);
/* Compute the first and second eigenvalues of the structure tensor using the equations in
* section "3.1 Orientation and Anisotropy Estimation" of the paper. */
float eigenvalue_first_term = (dxdx + dydy) / 2.0f;
float eigenvalue_square_root_term = sqrt(square(dxdx - dydy) + 4.0f * square(dxdy)) / 2.0f;
float first_eigenvalue = eigenvalue_first_term + eigenvalue_square_root_term;
float second_eigenvalue = eigenvalue_first_term - eigenvalue_square_root_term;
/* Compute eigenvalues of the structure tensor. */
const double tr = a + c;
const double discr = sqrt((a - b) * (a - b) + 4 * b * c);
const double lambda1 = (tr + discr) / 2;
const double lambda2 = (tr - discr) / 2;
/* Compute the normalized eigenvector of the structure tensor oriented in direction of the
* minimum rate of change using the equations in section "3.1 Orientation and Anisotropy
* Estimation" of the paper. */
float2 eigenvector = float2(first_eigenvalue - dxdx, -dxdy);
float eigenvector_length = length(eigenvector);
float2 unit_eigenvector = eigenvector_length != 0.0f ? eigenvector / eigenvector_length :
float2(1.0f);
/* Compute orientation and its strength based on structure tensor. */
const double orientation = 0.5 * atan2(2 * b, a - c);
const double strength = (lambda1 == 0 && lambda2 == 0) ?
0 :
(lambda1 - lambda2) / (lambda1 + lambda2);
/* Compute the amount of anisotropy using equations in section "3.1 Orientation and Anisotropy
* Estimation" of the paper. The anisotropy ranges from 0 to 1, where 0 corresponds to
* isotropic and 1 corresponds to entirely anisotropic regions. */
float eigenvalue_sum = first_eigenvalue + second_eigenvalue;
float eigenvalue_difference = first_eigenvalue - second_eigenvalue;
float anisotropy = eigenvalue_sum > 0.0f ? eigenvalue_difference / eigenvalue_sum : 0.0f;
Vector<double> mean(n_div_);
Vector<double> sum(n_div_);
Vector<double> var(n_div_);
Vector<double> weight(n_div_);
/* Compute the width and height of an ellipse that is more width-elongated for high anisotropy
* and more circular for low anisotropy, controlled using the eccentricity factor. Since the
* anisotropy is in the [0, 1] range, the width factor tends to 1 as the eccentricity tends to
* infinity and tends to infinity when the eccentricity tends to zero. This is based on the
* equations in section "3.2. Anisotropic Kuwahara Filtering" of the paper. */
float ellipse_width_factor = (get_eccentricity() + anisotropy) / get_eccentricity();
float ellipse_width = ellipse_width_factor * data.size;
float ellipse_height = data.size / ellipse_width_factor;
for (int ch = 0; ch < 3; ch++) {
mean.fill(0.0);
sum.fill(0.0);
var.fill(0.0);
weight.fill(0.0);
/* Compute the cosine and sine of the angle that the eigenvector makes with the x axis. Since
* the eigenvector is normalized, its x and y components are the cosine and sine of the angle
* it makes with the x axis. */
float cosine = unit_eigenvector.x;
float sine = unit_eigenvector.y;
double sx = 1.0f / (strength + 1.0f);
double sy = (1.0f + strength) / 1.0f;
double theta = -orientation;
/* Compute an inverse transformation matrix that represents an ellipse of the given width and
 * height that makes an angle with the x axis whose cosine and sine are the given values. This is
 * an inverse matrix, so it transforms the ellipse into a disk of unit radius. */
float2x2 inverse_ellipse_matrix = float2x2(
float2(cosine / ellipse_width, -sine / ellipse_height),
float2(sine / ellipse_width, cosine / ellipse_height));
for (int dy = -kernel_size_; dy <= kernel_size_; dy++) {
for (int dx = -kernel_size_; dx <= kernel_size_; dx++) {
if (dx == 0 && dy == 0)
continue;
/* Compute the bounding box of a zero centered ellipse whose major axis is aligned with the
* eigenvector and has the given width and height. This is based on the equations described in:
*
* https://iquilezles.org/articles/ellipses/
*
* Notice that we only compute the upper bound; the lower bound is just the negative of that,
* since the ellipse is zero centered. Also notice that we take the ceiling of the bounding box,
* just to ensure the filter window is at least 1x1. */
float2 ellipse_major_axis = ellipse_width * unit_eigenvector;
float2 ellipse_minor_axis = ellipse_height * float2(unit_eigenvector.y, unit_eigenvector.x) *
float2(-1, 1);
int2 ellipse_bounds = int2(
ceil(sqrt(square(ellipse_major_axis) + square(ellipse_minor_axis))));
/* Rotate and scale the kernel. This is the "anisotropic" part. */
int dx2 = int(sx * (cos(theta) * dx - sin(theta) * dy));
int dy2 = int(sy * (sin(theta) * dx + cos(theta) * dy));
/* Compute the overlap polynomial parameters for 8-sector ellipse based on the equations in
* section "3 Alternative Weighting Functions" of the polynomial weights paper. More on this
* later in the code. */
int number_of_sectors = 8;
float sector_center_overlap_parameter = 2.0f / data.size;
float sector_envelope_angle = ((3.0f / 2.0f) * M_PI) / number_of_sectors;
float cross_sector_overlap_parameter = (sector_center_overlap_parameter +
cos(sector_envelope_angle)) /
square(sin(sector_envelope_angle));
/* Clamp image to avoid artifacts at borders. */
const int xx = math::clamp(x + dx2, 0, width - 1);
const int yy = math::clamp(y + dy2, 0, height - 1);
/* We need to compute the weighted mean of color and squared color of each of the 8 sectors of
* the ellipse, so we declare arrays for accumulating those and initialize them in the next
* code section. */
float4 weighted_mean_of_squared_color_of_sectors[8];
float4 weighted_mean_of_color_of_sectors[8];
float sum_of_weights_of_sectors[8];
const double ddx2 = double(dx2);
const double ddy2 = double(dy2);
const double theta = atan2(ddy2, ddx2) + M_PI;
const int t = int(floor(theta / angle)) % n_div_;
double d2 = dx2 * dx2 + dy2 * dy2;
double g = exp(-d2 / (2.0 * kernel_size_));
const double v = image->get_value(xx, yy, ch);
float color[4];
image->read_elem(xx, yy, color);
/* TODO(@zazizizou): only compute lum once per region. */
const float lum = IMB_colormanagement_get_luminance(color);
/* TODO(@zazizizou): only compute mean for the selected region. */
mean[t] += g * v;
sum[t] += g * lum;
var[t] += g * lum * lum;
weight[t] += g;
}
}
/* Calculate weighted average. */
double de = 0.0;
double nu = 0.0;
for (int i = 0; i < n_div_; i++) {
double weight_inv = 1.0 / weight[i];
mean[i] = weight[i] != 0 ? mean[i] * weight_inv : 0.0;
sum[i] = weight[i] != 0 ? sum[i] * weight_inv : 0.0;
var[i] = weight[i] != 0 ? var[i] * weight_inv : 0.0;
var[i] = var[i] - sum[i] * sum[i];
var[i] = var[i] > FLT_EPSILON ? sqrt(var[i]) : FLT_EPSILON;
double w = powf(var[i], -q);
de += mean[i] * w;
nu += w;
}
double val = nu > EPS ? de / nu : 0.0;
it.out[ch] = val;
/* The center pixel (0, 0) is exempt from the main loop below for reasons that are explained in
* the first if statement in the loop, so we need to accumulate its color, squared color, and
* weight separately first. Luckily, the zero coordinates of the center pixel zero out most of
* the complex computations below, and it can easily be shown that the weight for the center
* pixel in all sectors is simply (1 / number_of_sectors). */
float4 center_color = float4(inputs[0]->get_elem(it.x, it.y));
float4 center_color_squared = center_color * center_color;
float center_weight = 1.0f / number_of_sectors;
float4 weighted_center_color = center_color * center_weight;
float4 weighted_center_color_squared = center_color_squared * center_weight;
for (int i = 0; i < number_of_sectors; i++) {
weighted_mean_of_squared_color_of_sectors[i] = weighted_center_color_squared;
weighted_mean_of_color_of_sectors[i] = weighted_center_color;
sum_of_weights_of_sectors[i] = center_weight;
}
/* No changes for alpha channel. */
it.out[3] = image->get_value(x, y, 3);
/* Loop over the window of pixels inside the bounding box of the ellipse. However, we utilize
* the fact that ellipses are mirror symmetric along the horizontal axis, so we reduce the
* window to only the upper two quadrants, and compute each two mirrored pixels at the same
* time using the same weight as an optimization. */
for (int j = 0; j <= ellipse_bounds.y; j++) {
for (int i = -ellipse_bounds.x; i <= ellipse_bounds.x; i++) {
/* Since we compute each two mirrored pixels at the same time, we need to also exempt the
 * pixels whose x coordinates are negative and whose y coordinates are zero. That's because
 * those are mirrored versions of the pixels whose x coordinates are positive and whose y
 * coordinates are zero, and we don't want to compute and accumulate them twice. Moreover,
 * we also need to exempt the center pixel with zero coordinates for the same reason.
 * However, since the mirror of the center pixel is itself, it needs to be accumulated
 * separately, which is why we did that in the code section just before this loop. */
if (j == 0 && i <= 0) {
continue;
}
/* Map the pixels of the ellipse into a unit disk, exempting any points that are not part
* of the ellipse or disk. */
float2 disk_point = inverse_ellipse_matrix * float2(i, j);
float disk_point_length_squared = dot(disk_point, disk_point);
if (disk_point_length_squared > 1.0f) {
continue;
}
/* While each pixel belongs to a single sector in the ellipse, we expand the definition of
* a sector a bit to also overlap with other sectors as illustrated in Figure 8 of the
* polynomial weights paper. So each pixel may contribute to multiple sectors, and thus we
* compute its weight in each of the 8 sectors. */
float sector_weights[8];
/* We evaluate the weighting polynomial at each of the 8 sectors by rotating the disk point
 * by 45 degrees and evaluating the weighting polynomial at each incremental rotation. To
 * avoid potentially expensive rotations, we utilize the fact that rotations by 90 degrees
 * simply swap the coordinates and negate the x component. We also note that since the y
 * term of the weighting polynomial is squared, it is not affected by the sign and can be
 * computed once for the x and once for the y coordinates. So we compute the four
 * even-indexed weights by successive 90 degree rotations as discussed. */
float2 polynomial = sector_center_overlap_parameter -
cross_sector_overlap_parameter * square(disk_point);
sector_weights[0] = square(max(0.0f, disk_point.y + polynomial.x));
sector_weights[2] = square(max(0.0f, -disk_point.x + polynomial.y));
sector_weights[4] = square(max(0.0f, -disk_point.y + polynomial.x));
sector_weights[6] = square(max(0.0f, disk_point.x + polynomial.y));
/* Then we rotate the disk point by 45 degrees, which is a simple expression involving a
* constant as can be demonstrated by applying a 45 degree rotation matrix. */
float2 rotated_disk_point = M_SQRT1_2 * float2(disk_point.x - disk_point.y,
disk_point.x + disk_point.y);
/* Finally, we compute the four odd-indexed weights starting from the disk point rotated by
 * 45 degrees. */
float2 rotated_polynomial = sector_center_overlap_parameter -
cross_sector_overlap_parameter * square(rotated_disk_point);
sector_weights[1] = square(max(0.0f, rotated_disk_point.y + rotated_polynomial.x));
sector_weights[3] = square(max(0.0f, -rotated_disk_point.x + rotated_polynomial.y));
sector_weights[5] = square(max(0.0f, -rotated_disk_point.y + rotated_polynomial.x));
sector_weights[7] = square(max(0.0f, rotated_disk_point.x + rotated_polynomial.y));
/* We compute a radial Gaussian weighting component such that pixels further away from the
 * sector center get attenuated, and we also divide by the sum of sector weights to
 * normalize them, since the radial weight will eventually be multiplied by the sector
 * weight below. */
float sector_weights_sum = sector_weights[0] + sector_weights[1] + sector_weights[2] +
sector_weights[3] + sector_weights[4] + sector_weights[5] +
sector_weights[6] + sector_weights[7];
float radial_gaussian_weight = exp(-M_PI * disk_point_length_squared) / sector_weights_sum;
/* Load the color of the pixel and its mirrored pixel and compute their squares. */
float4 upper_color = float4(
inputs[0]->get_elem(clamp(it.x + i, 0, inputs[0]->get_width() - 1),
clamp(it.y + j, 0, inputs[0]->get_height() - 1)));
float4 lower_color = float4(
inputs[0]->get_elem(clamp(it.x - i, 0, inputs[0]->get_width() - 1),
clamp(it.y - j, 0, inputs[0]->get_height() - 1)));
float4 upper_color_squared = upper_color * upper_color;
float4 lower_color_squared = lower_color * lower_color;
for (int k = 0; k < number_of_sectors; k++) {
float weight = sector_weights[k] * radial_gaussian_weight;
/* Accumulate the pixel to each of the sectors multiplied by the sector weight. */
int upper_index = k;
sum_of_weights_of_sectors[upper_index] += weight;
weighted_mean_of_color_of_sectors[upper_index] += upper_color * weight;
weighted_mean_of_squared_color_of_sectors[upper_index] += upper_color_squared * weight;
/* Accumulate the mirrored pixel to each of the sectors multiplied by the sector weight.
*/
int lower_index = (k + number_of_sectors / 2) % number_of_sectors;
sum_of_weights_of_sectors[lower_index] += weight;
weighted_mean_of_color_of_sectors[lower_index] += lower_color * weight;
weighted_mean_of_squared_color_of_sectors[lower_index] += lower_color_squared * weight;
}
}
}
/* Compute the weighted sum of the means of the sectors, such that sectors with lower standard
 * deviation get more significant weights than sectors with higher standard deviation. */
float sum_of_weights = 0.0f;
float4 weighted_sum = float4(0.0f);
for (int i = 0; i < number_of_sectors; i++) {
weighted_mean_of_color_of_sectors[i] /= sum_of_weights_of_sectors[i];
weighted_mean_of_squared_color_of_sectors[i] /= sum_of_weights_of_sectors[i];
float4 color_mean = weighted_mean_of_color_of_sectors[i];
float4 squared_color_mean = weighted_mean_of_squared_color_of_sectors[i];
float4 color_variance = abs(squared_color_mean - color_mean * color_mean);
float standard_deviation = dot(sqrt(color_variance.xyz()), float3(1.0));
/* Compute the sector weight based on the weight function introduced in section "3.3.1
* Single-scale Filtering" of the multi-scale paper. Use a threshold of 0.02 to avoid zero
* division and avoid artifacts in homogeneous regions as demonstrated in the paper. */
float weight = 1.0f / pow(max(0.02f, standard_deviation), get_sharpness());
sum_of_weights += weight;
weighted_sum += color_mean * weight;
}
weighted_sum /= sum_of_weights;
copy_v4_v4(it.out, weighted_sum);
}
}
/* The sharpness controls the sharpness of the transitions between the Kuwahara sectors, which is
* controlled by the weighting function pow(standard_deviation, -sharpness) as can be seen in the
* compute function. The transition is completely smooth when the sharpness is zero and completely
* sharp when it is infinity. But realistically, the sharpness doesn't change much beyond the value
* of 16 due to its exponential nature, so we just assume a maximum sharpness of 16.
*
* The stored sharpness is in the range [0, 1], so we multiply by 16 to get it in the range
* [0, 16], however, we also square it before multiplication to slow down the rate of change near
* zero to counter its exponential nature for more intuitive user control. */
float KuwaharaAnisotropicOperation::get_sharpness()
{
return data.sharpness * data.sharpness * 16.0f;
}
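That is, the stored sharpness s ∈ [0, 1] maps to the exponent:

q(s) = 16 s^2, \qquad q(0) = 0, \quad q(0.5) = 4, \quad q(1) = 16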
/* The eccentricity controls how much the image anisotropy affects the eccentricity of the
* Kuwahara sectors, which is controlled by the following factor that gets multiplied by the
* radius to get the ellipse width and divides the radius to get the ellipse height:
*
* (eccentricity + anisotropy) / eccentricity
*
* Since the anisotropy is in the [0, 1] range, the factor tends to 1 as the eccentricity tends
* to infinity and tends to infinity when the eccentricity tends to zero. The stored eccentricity
* is in the range [0, 2], we map that to the range [infinity, 0.5] by taking the reciprocal,
* satisfying the aforementioned limits. The upper limit doubles the computed default
* eccentricity, which users can use to enhance the directionality of the filter. Instead of
* actual infinity, we just use an eccentricity of 1 / 0.01 since the result is very similar to
* that of infinity. */
float KuwaharaAnisotropicOperation::get_eccentricity()
{
return 1.0f / math::max(0.01f, data.eccentricity);
}
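So the stored eccentricity e ∈ [0, 2] maps to the factor:

\alpha(e) = \frac{1}{\max(0.01, e)} \in [0.5, 100]

with e = 0 clamped to the near-infinite α = 100, and e = 2 halving the factor to enhance directionality.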
} // namespace blender::compositor

@@ -4,33 +4,29 @@
#pragma once
#include "DNA_node_types.h"
#include "COM_MultiThreadedOperation.h"
namespace blender::compositor {
class KuwaharaAnisotropicOperation : public MultiThreadedOperation {
SocketReader *image_reader_;
SocketReader *s_xx_reader_;
SocketReader *s_yy_reader_;
SocketReader *s_xy_reader_;
int kernel_size_;
int n_div_;
SocketReader *structure_tensor_reader_;
public:
NodeKuwaharaData data;
KuwaharaAnisotropicOperation();
void init_execution() override;
void deinit_execution() override;
void execute_pixel_sampled(float output[4], float x, float y, PixelSampler sampler) override;
void set_kernel_size(int kernel_size);
int get_kernel_size();
int get_n_div();
void update_memory_buffer_partial(MemoryBuffer *output,
const rcti &area,
Span<MemoryBuffer *> inputs) override;
float get_sharpness();
float get_eccentricity();
};
} // namespace blender::compositor

@@ -0,0 +1,155 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#include "COM_KuwaharaAnisotropicStructureTensorOperation.h"
#include "BLI_math_base.hh"
#include "BLI_math_vector.h"
#include "BLI_math_vector.hh"
#include "BLI_math_vector_types.hh"
namespace blender::compositor {
KuwaharaAnisotropicStructureTensorOperation::KuwaharaAnisotropicStructureTensorOperation()
{
this->add_input_socket(DataType::Color);
this->add_output_socket(DataType::Color);
this->flags_.is_fullframe_operation = true;
}
void KuwaharaAnisotropicStructureTensorOperation::init_execution()
{
image_reader_ = this->get_input_socket_reader(0);
}
void KuwaharaAnisotropicStructureTensorOperation::deinit_execution()
{
image_reader_ = nullptr;
}
/* Computes the structure tensor of the image using a Dirac delta window function as described in
* section "3.2 Local Structure Estimation" of the paper:
*
* Kyprianidis, Jan Eric. "Image and video abstraction by multi-scale anisotropic Kuwahara
* filtering." 2011.
*
* The structure tensor should then be smoothed using a Gaussian function to eliminate high
* frequency details. */
void KuwaharaAnisotropicStructureTensorOperation::execute_pixel_sampled(float output[4],
float x_float,
float y_float,
PixelSampler /* sampler */)
{
using math::max, math::min, math::dot;
const int x = x_float;
const int y = y_float;
const int width = this->get_width();
const int height = this->get_height();
/* The weight kernels of the filter optimized for rotational symmetry described in section "3.2.1
* Gradient Calculation". */
const float corner_weight = 0.182f;
const float center_weight = 1.0f - 2.0f * corner_weight;
float4 input_color;
float3 x_partial_derivative = float3(0.0f);
image_reader_->read(input_color, max(0, x - 1), min(height - 1, y + 1), nullptr);
x_partial_derivative += input_color.xyz() * -corner_weight;
image_reader_->read(input_color, max(0, x - 1), y, nullptr);
x_partial_derivative += input_color.xyz() * -center_weight;
image_reader_->read(input_color, max(0, x - 1), max(0, y - 1), nullptr);
x_partial_derivative += input_color.xyz() * -corner_weight;
image_reader_->read(input_color, min(width - 1, x + 1), min(height - 1, y + 1), nullptr);
x_partial_derivative += input_color.xyz() * corner_weight;
image_reader_->read(input_color, min(width - 1, x + 1), y, nullptr);
x_partial_derivative += input_color.xyz() * center_weight;
image_reader_->read(input_color, min(width - 1, x + 1), max(0, y - 1), nullptr);
x_partial_derivative += input_color.xyz() * corner_weight;
float3 y_partial_derivative = float3(0.0f);
image_reader_->read(input_color, max(0, x - 1), min(height - 1, y + 1), nullptr);
y_partial_derivative += input_color.xyz() * corner_weight;
image_reader_->read(input_color, x, min(height - 1, y + 1), nullptr);
y_partial_derivative += input_color.xyz() * center_weight;
image_reader_->read(input_color, min(width - 1, x + 1), min(height - 1, y + 1), nullptr);
y_partial_derivative += input_color.xyz() * corner_weight;
image_reader_->read(input_color, max(0, x - 1), max(0, y - 1), nullptr);
y_partial_derivative += input_color.xyz() * -corner_weight;
image_reader_->read(input_color, x, max(0, y - 1), nullptr);
y_partial_derivative += input_color.xyz() * -center_weight;
image_reader_->read(input_color, min(width - 1, x + 1), max(0, y - 1), nullptr);
y_partial_derivative += input_color.xyz() * -corner_weight;
/* We encode the structure tensor in a float4 using a column major storage order. */
float4 structure_tensor = float4(dot(x_partial_derivative, x_partial_derivative),
dot(x_partial_derivative, y_partial_derivative),
dot(x_partial_derivative, y_partial_derivative),
dot(y_partial_derivative, y_partial_derivative));
copy_v4_v4(output, structure_tensor);
}
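For reference, the reads above implement per-channel convolutions with the following 3x3 kernels (top row corresponds to y + 1), with c = 0.182 and m = 1 − 2c = 0.636, and the channel-wise dot products of the RGB gradients g_x and g_y then form the symmetric tensor stored column-major in the float4:

K_x = \begin{pmatrix} -c & 0 & c \\ -m & 0 & m \\ -c & 0 & c \end{pmatrix},
\qquad
K_y = \begin{pmatrix} c & m & c \\ 0 & 0 & 0 \\ -c & -m & -c \end{pmatrix},
\qquad
S = \begin{pmatrix} g_x \cdot g_x & g_x \cdot g_y \\ g_x \cdot g_y & g_y \cdot g_y \end{pmatrix}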
/* Computes the structure tensor of the image using a Dirac delta window function as described in
* section "3.2 Local Structure Estimation" of the paper:
*
* Kyprianidis, Jan Eric. "Image and video abstraction by multi-scale anisotropic Kuwahara
* filtering." 2011.
*
* The structure tensor should then be smoothed using a Gaussian function to eliminate high
* frequency details. */
void KuwaharaAnisotropicStructureTensorOperation::update_memory_buffer_partial(
MemoryBuffer *output, const rcti &area, Span<MemoryBuffer *> inputs)
{
using math::max, math::min, math::dot;
MemoryBuffer *image = inputs[0];
const int width = image->get_width();
const int height = image->get_height();
/* The weight kernels of the filter optimized for rotational symmetry described in section
* "3.2.1 Gradient Calculation". */
const float corner_weight = 0.182f;
const float center_weight = 1.0f - 2.0f * corner_weight;
for (BuffersIterator<float> it = output->iterate_with(inputs, area); !it.is_end(); ++it) {
const int x = it.x;
const int y = it.y;
float3 input_color;
float3 x_partial_derivative = float3(0.0f);
input_color = float3(image->get_elem(max(0, x - 1), min(height - 1, y + 1)));
x_partial_derivative += input_color * -corner_weight;
input_color = float3(image->get_elem(max(0, x - 1), y));
x_partial_derivative += input_color * -center_weight;
input_color = float3(image->get_elem(max(0, x - 1), max(0, y - 1)));
x_partial_derivative += input_color * -corner_weight;
input_color = float3(image->get_elem(min(width - 1, x + 1), min(height - 1, y + 1)));
x_partial_derivative += input_color * corner_weight;
input_color = float3(image->get_elem(min(width - 1, x + 1), y));
x_partial_derivative += input_color * center_weight;
input_color = float3(image->get_elem(min(width - 1, x + 1), max(0, y - 1)));
x_partial_derivative += input_color * corner_weight;
float3 y_partial_derivative = float3(0.0f);
input_color = float3(image->get_elem(max(0, x - 1), min(height - 1, y + 1)));
y_partial_derivative += input_color * corner_weight;
input_color = float3(image->get_elem(x, min(height - 1, y + 1)));
y_partial_derivative += input_color * center_weight;
input_color = float3(image->get_elem(min(width - 1, x + 1), min(height - 1, y + 1)));
y_partial_derivative += input_color * corner_weight;
input_color = float3(image->get_elem(max(0, x - 1), max(0, y - 1)));
y_partial_derivative += input_color * -corner_weight;
input_color = float3(image->get_elem(x, max(0, y - 1)));
y_partial_derivative += input_color * -center_weight;
input_color = float3(image->get_elem(min(width - 1, x + 1), max(0, y - 1)));
y_partial_derivative += input_color * -corner_weight;
/* We encode the structure tensor in a float4 using a column major storage order. */
float4 structure_tensor = float4(dot(x_partial_derivative, x_partial_derivative),
dot(x_partial_derivative, y_partial_derivative),
dot(x_partial_derivative, y_partial_derivative),
dot(y_partial_derivative, y_partial_derivative));
copy_v4_v4(it.out, structure_tensor);
}
}
} // namespace blender::compositor

@@ -0,0 +1,25 @@
/* SPDX-FileCopyrightText: 2023 Blender Foundation
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include "COM_MultiThreadedOperation.h"
namespace blender::compositor {
class KuwaharaAnisotropicStructureTensorOperation : public MultiThreadedOperation {
SocketReader *image_reader_;
public:
KuwaharaAnisotropicStructureTensorOperation();
void init_execution() override;
void deinit_execution() override;
void execute_pixel_sampled(float output[4], float x, float y, PixelSampler sampler) override;
void update_memory_buffer_partial(MemoryBuffer *output,
const rcti &area,
Span<MemoryBuffer *> inputs) override;
};
} // namespace blender::compositor

@@ -143,6 +143,8 @@ set(GLSL_SRC
shaders/compositor_keying_extract_chroma.glsl
shaders/compositor_keying_replace_chroma.glsl
shaders/compositor_keying_tweak_matte.glsl
shaders/compositor_kuwahara_anisotropic.glsl
shaders/compositor_kuwahara_anisotropic_compute_structure_tensor.glsl
shaders/compositor_kuwahara_classic.glsl
shaders/compositor_map_uv.glsl
shaders/compositor_morphological_distance.glsl

@@ -0,0 +1,237 @@
#pragma BLENDER_REQUIRE(gpu_shader_math_base_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
/* An implementation of the Anisotropic Kuwahara filter described in the paper:
*
* Kyprianidis, Jan Eric, Henry Kang, and Jurgen Dollner. "Image and video abstraction by
* anisotropic Kuwahara filtering." 2009.
*
* But with the polynomial weighting functions described in the paper:
*
* Kyprianidis, Jan Eric, et al. "Anisotropic Kuwahara Filtering with Polynomial Weighting
* Functions." 2010.
*
* And the sector weight function described in the paper:
*
* Kyprianidis, Jan Eric. "Image and video abstraction by multi-scale anisotropic Kuwahara
* filtering." 2011.
*/
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
/* The structure tensor is encoded in a vec4 using a column major storage order, as can be seen
* in the compositor_kuwahara_anisotropic_compute_structure_tensor.glsl shader. */
vec4 encoded_structure_tensor = texture_load(structure_tensor_tx, texel);
float dxdx = encoded_structure_tensor.x;
float dxdy = encoded_structure_tensor.y;
float dydy = encoded_structure_tensor.w;
/* Compute the first and second eigenvalues of the structure tensor using the equations in
* section "3.1 Orientation and Anisotropy Estimation" of the paper. */
float eigenvalue_first_term = (dxdx + dydy) / 2.0;
float eigenvalue_square_root_term = sqrt(square(dxdx - dydy) + 4.0 * square(dxdy)) / 2.0;
float first_eigenvalue = eigenvalue_first_term + eigenvalue_square_root_term;
float second_eigenvalue = eigenvalue_first_term - eigenvalue_square_root_term;
/* Compute the normalized eigenvector of the structure tensor oriented in direction of the
* minimum rate of change using the equations in section "3.1 Orientation and Anisotropy
* Estimation" of the paper. */
vec2 eigenvector = vec2(first_eigenvalue - dxdx, -dxdy);
float eigenvector_length = length(eigenvector);
vec2 unit_eigenvector = eigenvector_length != 0.0 ? eigenvector / eigenvector_length : vec2(1.0);
/* Compute the amount of anisotropy using equations in section "3.1 Orientation and Anisotropy
* Estimation" of the paper. The anisotropy ranges from 0 to 1, where 0 corresponds to isotropic
* and 1 corresponds to entirely anisotropic regions. */
float eigenvalue_sum = first_eigenvalue + second_eigenvalue;
float eigenvalue_difference = first_eigenvalue - second_eigenvalue;
float anisotropy = eigenvalue_sum > 0.0 ? eigenvalue_difference / eigenvalue_sum : 0.0;
/* Compute the width and height of an ellipse that is more width-elongated for high anisotropy
* and more circular for low anisotropy, controlled using the eccentricity factor. Since the
* anisotropy is in the [0, 1] range, the width factor tends to 1 as the eccentricity tends to
* infinity and tends to infinity when the eccentricity tends to zero. This is based on the
* equations in section "3.2. Anisotropic Kuwahara Filtering" of the paper. */
float ellipse_width_factor = (eccentricity + anisotropy) / eccentricity;
float ellipse_width = ellipse_width_factor * radius;
float ellipse_height = radius / ellipse_width_factor;
/* Compute the cosine and sine of the angle that the eigenvector makes with the x axis. Since the
* eigenvector is normalized, its x and y components are the cosine and sine of the angle it
* makes with the x axis. */
float cosine = unit_eigenvector.x;
float sine = unit_eigenvector.y;
/* Compute an inverse transformation matrix that represents an ellipse of the given width and
 * height that makes an angle with the x axis whose cosine and sine are the given values. This is
 * an inverse matrix, so it transforms the ellipse into a disk of unit radius. */
mat2 inverse_ellipse_matrix = mat2(cosine / ellipse_width,
-sine / ellipse_height,
sine / ellipse_width,
cosine / ellipse_height);
/* Compute the bounding box of a zero centered ellipse whose major axis is aligned with the
* eigenvector and has the given width and height. This is based on the equations described in:
*
* https://iquilezles.org/articles/ellipses/
*
* Notice that we only compute the upper bound; the lower bound is just the negative of that,
* since the ellipse is zero centered. Also notice that we take the ceiling of the bounding box,
* just to ensure the filter window is at least 1x1. */
vec2 ellipse_major_axis = ellipse_width * unit_eigenvector;
vec2 ellipse_minor_axis = ellipse_height * unit_eigenvector.yx * vec2(-1, 1);
ivec2 ellipse_bounds = ivec2(
ceil(sqrt(square(ellipse_major_axis) + square(ellipse_minor_axis))));
/* Compute the overlap polynomial parameters for 8-sector ellipse based on the equations in
* section "3 Alternative Weighting Functions" of the polynomial weights paper. More on this
* later in the code. */
const int number_of_sectors = 8;
float sector_center_overlap_parameter = 2.0 / radius;
float sector_envelope_angle = ((3.0 / 2.0) * M_PI) / number_of_sectors;
float cross_sector_overlap_parameter = (sector_center_overlap_parameter +
cos(sector_envelope_angle)) /
square(sin(sector_envelope_angle));
/* We need to compute the weighted mean of color and squared color of each of the 8 sectors of
* the ellipse, so we declare arrays for accumulating those and initialize them in the next code
* section. */
vec4 weighted_mean_of_squared_color_of_sectors[8];
vec4 weighted_mean_of_color_of_sectors[8];
float sum_of_weights_of_sectors[8];
/* The center pixel (0, 0) is exempt from the main loop below for reasons that are explained in
* the first if statement in the loop, so we need to accumulate its color, squared color, and
 * weight separately first. Luckily, the zero coordinates of the center pixel zero out most of
* the complex computations below, and it can easily be shown that the weight for the center
* pixel in all sectors is simply (1 / number_of_sectors). */
vec4 center_color = texture_load(input_tx, texel);
vec4 center_color_squared = center_color * center_color;
float center_weight = 1.0 / number_of_sectors;
vec4 weighted_center_color = center_color * center_weight;
vec4 weighted_center_color_squared = center_color_squared * center_weight;
for (int i = 0; i < number_of_sectors; i++) {
weighted_mean_of_squared_color_of_sectors[i] = weighted_center_color_squared;
weighted_mean_of_color_of_sectors[i] = weighted_center_color;
sum_of_weights_of_sectors[i] = center_weight;
}
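To spell out the claim about the center weight: at the center pixel the disk point is (0, 0), so the weighting polynomial reduces to sector_center_overlap_parameter for every sector, giving each sector a weight of square(sector_center_overlap_parameter) and a weight sum of 8 times that. The radial Gaussian weight is exp(0) divided by that sum, so the product of the two is exactly 1 / 8, that is, 1 / number_of_sectors, independent of the radius.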
/* Loop over the window of pixels inside the bounding box of the ellipse. However, we utilize the
* fact that ellipses are mirror symmetric along the horizontal axis, so we reduce the window to
* only the upper two quadrants, and compute each two mirrored pixels at the same time using the
* same weight as an optimization. */
for (int j = 0; j <= ellipse_bounds.y; j++) {
for (int i = -ellipse_bounds.x; i <= ellipse_bounds.x; i++) {
/* Since we compute each two mirrored pixels at the same time, we need to also exempt the
* pixels whose x coordinates are negative and their y coordinates are zero, that's because
* those are mirrored versions of the pixels whose x coordinates are positive and their y
* coordinates are zero, and we don't want to compute and accumulate them twice. Moreover, we
 * also need to exempt the center pixel with zero coordinates for the same reason. However,
 * since the mirror of the center pixel is itself, it needs to be accumulated separately,
 * which is why we did that in the code section just before this loop. */
if (j == 0 && i <= 0) {
continue;
}
/* Map the pixels of the ellipse into a unit disk, exempting any points that are not part of
* the ellipse or disk. */
vec2 disk_point = inverse_ellipse_matrix * vec2(i, j);
float disk_point_length_squared = dot(disk_point, disk_point);
if (disk_point_length_squared > 1.0) {
continue;
}
/* While each pixel belongs to a single sector in the ellipse, we expand the definition of
* a sector a bit to also overlap with other sectors as illustrated in Figure 8 of the
* polynomial weights paper. So each pixel may contribute to multiple sectors, and thus we
* compute its weight in each of the 8 sectors. */
float sector_weights[8];
/* We evaluate the weighting polynomial at each of the 8 sectors by rotating the disk point
* by 45 degrees and evaluating the weighting polynomial at each incremental rotation. To
* avoid potentially expensive rotations, we utilize the fact that rotations by 90 degrees
OmarEmaraDev marked this conversation as resolved:
- typo?
- I will reword that.
 * are simply a swap of the coordinates together with a negation of the x component. We also
 * note that since the y term of the weighting polynomial is squared, it is not affected by
 * the sign and can be computed once for the x and once for the y coordinates. So we compute
 * the four even-indexed weights by successive 90 degree rotations as discussed. */
vec2 polynomial = sector_center_overlap_parameter -
cross_sector_overlap_parameter * square(disk_point);
sector_weights[0] = square(max(0.0, disk_point.y + polynomial.x));
sector_weights[2] = square(max(0.0, -disk_point.x + polynomial.y));
sector_weights[4] = square(max(0.0, -disk_point.y + polynomial.x));
sector_weights[6] = square(max(0.0, disk_point.x + polynomial.y));
/* Then we rotate the disk point by 45 degrees, which is a simple expression involving a
* constant as can be demonstrated by applying a 45 degree rotation matrix. */
vec2 rotated_disk_point = M_SQRT1_2 *
vec2(disk_point.x - disk_point.y, disk_point.x + disk_point.y);
      /* Finally, we compute the four odd-indexed weights starting from the disk point rotated
       * by 45 degrees. */
vec2 rotated_polynomial = sector_center_overlap_parameter -
cross_sector_overlap_parameter * square(rotated_disk_point);
sector_weights[1] = square(max(0.0, rotated_disk_point.y + rotated_polynomial.x));
sector_weights[3] = square(max(0.0, -rotated_disk_point.x + rotated_polynomial.y));
sector_weights[5] = square(max(0.0, -rotated_disk_point.y + rotated_polynomial.x));
sector_weights[7] = square(max(0.0, rotated_disk_point.x + rotated_polynomial.y));
      /* We compute a radial Gaussian weighting component such that pixels further away from
       * the sector center get attenuated, and we also divide by the sum of sector weights to
       * normalize them, since the radial weight will eventually be multiplied by the sector
       * weight below. See the C++ sketch after this loop for a condensed version of this
       * weighting step. */
float sector_weights_sum = sector_weights[0] + sector_weights[1] + sector_weights[2] +
sector_weights[3] + sector_weights[4] + sector_weights[5] +
sector_weights[6] + sector_weights[7];
float radial_gaussian_weight = exp(-M_PI * disk_point_length_squared) / sector_weights_sum;
/* Load the color of the pixel and its mirrored pixel and compute their square. */
vec4 upper_color = texture_load(input_tx, texel + ivec2(i, j));
vec4 lower_color = texture_load(input_tx, texel - ivec2(i, j));
vec4 upper_color_squared = upper_color * upper_color;
vec4 lower_color_squared = lower_color * lower_color;
for (int k = 0; k < number_of_sectors; k++) {
float weight = sector_weights[k] * radial_gaussian_weight;
/* Accumulate the pixel to each of the sectors multiplied by the sector weight. */
int upper_index = k;
sum_of_weights_of_sectors[upper_index] += weight;
weighted_mean_of_color_of_sectors[upper_index] += upper_color * weight;
weighted_mean_of_squared_color_of_sectors[upper_index] += upper_color_squared * weight;
/* Accumulate the mirrored pixel to each of the sectors multiplied by the sector weight. */
int lower_index = (k + number_of_sectors / 2) % number_of_sectors;
sum_of_weights_of_sectors[lower_index] += weight;
weighted_mean_of_color_of_sectors[lower_index] += lower_color * weight;
weighted_mean_of_squared_color_of_sectors[lower_index] += lower_color_squared * weight;
}
}
}
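The per-pixel weighting inside the loop above admits a compact C++ sketch, with hypothetical names and a single scalar polynomial helper instead of the vectorized GLSL form (not part of the patch). It fills the eight sector weights and returns the normalized radial Gaussian factor:

#include <algorithm>
#include <cmath>

/* Hypothetical sketch: evaluate the weighting polynomial at the four 90 degree
 * rotations of the disk point and of its 45 degree rotation, then compute the
 * radial Gaussian factor normalized by the sum of sector weights. */
static float sector_and_radial_weights(
    float x, float y, float center_overlap, float cross_overlap, float weights[8])
{
  const float pi = 3.14159265358979f;
  auto polynomial = [&](float px, float py) {
    const float value = std::max(0.0f, py + (center_overlap - cross_overlap * px * px));
    return value * value;
  };
  /* Even-indexed sectors: 90 degree rotations swap coordinates and negate x. */
  weights[0] = polynomial(x, y);
  weights[2] = polynomial(y, -x);
  weights[4] = polynomial(-x, -y);
  weights[6] = polynomial(-y, x);
  /* Odd-indexed sectors: rotate by 45 degrees first. */
  const float rx = (x - y) / std::sqrt(2.0f);
  const float ry = (x + y) / std::sqrt(2.0f);
  weights[1] = polynomial(rx, ry);
  weights[3] = polynomial(ry, -rx);
  weights[5] = polynomial(-rx, -ry);
  weights[7] = polynomial(-ry, rx);

  float sum = 0.0f;
  for (int i = 0; i < 8; i++) {
    sum += weights[i];
  }
  return std::exp(-pi * (x * x + y * y)) / sum;
}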
  /* Compute the weighted sum of the means of the sectors, such that sectors with lower
   * standard deviation get more significant weights than sectors with higher standard
   * deviation. */
float sum_of_weights = 0.0;
vec4 weighted_sum = vec4(0.0);
for (int i = 0; i < number_of_sectors; i++) {
weighted_mean_of_color_of_sectors[i] /= sum_of_weights_of_sectors[i];
weighted_mean_of_squared_color_of_sectors[i] /= sum_of_weights_of_sectors[i];
vec4 color_mean = weighted_mean_of_color_of_sectors[i];
vec4 squared_color_mean = weighted_mean_of_squared_color_of_sectors[i];
vec4 color_variance = abs(squared_color_mean - color_mean * color_mean);
float standard_deviation = dot(sqrt(color_variance.rgb), vec3(1.0));
/* Compute the sector weight based on the weight function introduced in section "3.3.1
* Single-scale Filtering" of the multi-scale paper. Use a threshold of 0.02 to avoid zero
* division and avoid artifacts in homogeneous regions as demonstrated in the paper. */
float weight = 1.0 / pow(max(0.02, standard_deviation), sharpness);
sum_of_weights += weight;
weighted_sum += color_mean * weight;
}
weighted_sum /= sum_of_weights;
imageStore(output_img, texel, weighted_sum);
}
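For completeness, the final blending step admits a compact single-channel C++ sketch, assuming the per-sector means were already normalized by the weight sums; note that in the shader the standard deviation is instead summed over the RGB channels. Names are hypothetical and this is not part of the patch:

#include <algorithm>
#include <cmath>

/* Hypothetical sketch: blend the 8 sector means such that sectors with lower
 * standard deviation receive higher weights. */
static float blend_sectors(const float means[8], const float squared_means[8], float sharpness)
{
  float sum_of_weights = 0.0f;
  float weighted_sum = 0.0f;
  for (int i = 0; i < 8; i++) {
    const float variance = std::abs(squared_means[i] - means[i] * means[i]);
    const float standard_deviation = std::sqrt(variance);
    /* The 0.02 threshold avoids division by zero and artifacts in flat regions. */
    const float weight = 1.0f / std::pow(std::max(0.02f, standard_deviation), sharpness);
    sum_of_weights += weight;
    weighted_sum += means[i] * weight;
  }
  return weighted_sum / sum_of_weights;
}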

View File

@ -0,0 +1,42 @@
#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
zazizizou marked this conversation as resolved:
- Looking at the CPU implementation made me wonder, isn't this a more general (approximation of) structure tensor which is not specific to Kuwahara filter? So is there a reason the implementation is not in `blender/compositor/realtime_compositor/algorithms`?
- Maybe, but that could be done when it is really needed.
/* Computes the structure tensor of the image using a Dirac delta window function as described in
* section "3.2 Local Structure Estimation" of the paper:
*
* Kyprianidis, Jan Eric. "Image and video abstraction by multi-scale anisotropic Kuwahara
* filtering." 2011.
*
* The structure tensor should then be smoothed using a Gaussian function to eliminate high
* frequency details. */
void main()
{
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
/* The weight kernels of the filter optimized for rotational symmetry described in section "3.2.1
* Gradient Calculation". */
const float corner_weight = 0.182;
OmarEmaraDev marked this conversation as resolved (outdated):
- Use const. On both.
const float center_weight = 1.0 - 2.0 * corner_weight;
vec3 x_partial_derivative = texture_load(input_tx, texel + ivec2(-1, 1)).rgb * -corner_weight +
texture_load(input_tx, texel + ivec2(-1, 0)).rgb * -center_weight +
texture_load(input_tx, texel + ivec2(-1, -1)).rgb * -corner_weight +
texture_load(input_tx, texel + ivec2(1, 1)).rgb * corner_weight +
texture_load(input_tx, texel + ivec2(1, 0)).rgb * center_weight +
texture_load(input_tx, texel + ivec2(1, -1)).rgb * corner_weight;
vec3 y_partial_derivative = texture_load(input_tx, texel + ivec2(-1, 1)).rgb * corner_weight +
texture_load(input_tx, texel + ivec2(0, 1)).rgb * center_weight +
texture_load(input_tx, texel + ivec2(1, 1)).rgb * corner_weight +
texture_load(input_tx, texel + ivec2(-1, -1)).rgb * -corner_weight +
texture_load(input_tx, texel + ivec2(0, -1)).rgb * -center_weight +
texture_load(input_tx, texel + ivec2(1, -1)).rgb * -corner_weight;
float dxdx = dot(x_partial_derivative, x_partial_derivative);
float dxdy = dot(x_partial_derivative, y_partial_derivative);
float dydy = dot(y_partial_derivative, y_partial_derivative);
OmarEmaraDev marked this conversation as resolved (outdated):
- Here you take twice the same dot product `dot(x_partial_derivative, y_partial_derivative)`. I guess it is just because the dot product is commutative. Still a bit confusing though. Also that raises the question as to why store twice the same value. Maybe having a R16F + a RG16F texture is a better choice here. But if the textures are not used anywhere it might not be beneficial.
- That's correct. I just stored the extra value for clarity, since that is what the matrix contains, and I found it not worth it to use an R16F + an RG16F texture for that.
/* We encode the structure tensor in a vec4 using a column major storage order. */
vec4 structure_tensor = vec4(dxdx, dxdy, dxdy, dydy);
imageStore(structure_tensor_img, texel, structure_tensor);
}
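Read as plain code, the kernel above is two 3x3 Sobel-like derivative filters with re-tuned weights. A single-channel C++ sketch, with a hypothetical image sampling callback (not part of the patch):

#include <array>

/* Hypothetical sketch: the structure tensor at (x, y) from the rotationally
 * optimized derivative kernels, on a single luminance channel. image(x, y) is
 * assumed to return a boundary-clamped sample. */
static std::array<float, 3> structure_tensor_at(float (*image)(int, int), int x, int y)
{
  const float corner_weight = 0.182f;
  const float center_weight = 1.0f - 2.0f * corner_weight;

  const float dx = (image(x + 1, y + 1) - image(x - 1, y + 1)) * corner_weight +
                   (image(x + 1, y) - image(x - 1, y)) * center_weight +
                   (image(x + 1, y - 1) - image(x - 1, y - 1)) * corner_weight;
  const float dy = (image(x - 1, y + 1) - image(x - 1, y - 1)) * corner_weight +
                   (image(x, y + 1) - image(x, y - 1)) * center_weight +
                   (image(x + 1, y + 1) - image(x + 1, y - 1)) * corner_weight;

  /* The unique entries of the symmetric tensor: (dxdx, dxdy, dydy). */
  return {dx * dx, dx * dy, dy * dy};
}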

View File

@ -21,3 +21,21 @@ GPU_SHADER_CREATE_INFO(compositor_kuwahara_classic_summed_area_table)
.sampler(0, ImageType::FLOAT_2D, "table_tx")
.sampler(1, ImageType::FLOAT_2D, "squared_table_tx")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_kuwahara_anisotropic_compute_structure_tensor)
.local_group_size(16, 16)
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "structure_tensor_img")
.compute_source("compositor_kuwahara_anisotropic_compute_structure_tensor.glsl")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(compositor_kuwahara_anisotropic)
.local_group_size(16, 16)
.push_constant(Type::INT, "radius")
zazizizou marked this conversation as resolved:
- `radius` is actually called `size` in UI, right? I think we should unify naming here. `radius` is also a fitting name for the classic variation so I don't mind renaming the UI parameter to radius.
- Size is used throughout filter nodes to denote filter window radius, so it is probably okay to leave it as Size for consistency.
- Is it possible to rename radius to size in code then? I found it a bit hard to navigate code if variables get renamed from UI to implementation.
- Radius makes the code clearer and is what the paper uses, so it is a necessary evil I think.
.push_constant(Type::FLOAT, "eccentricity")
OmarEmaraDev marked this conversation as resolved:
- For me it's a bit hard to guess the effect of the parameter without reading documentation. Is `preserve edge` a better name maybe?
- You mean eccentricity? Preserve Edge is a bit misleading in that case, because as its documentation says, it defines how directional the filter is. Googling eccentricity gives good visualizations, so I think it is fine personally.
- Yes I meant eccentricity, sorry. I think I understand what it means, I was just thinking how hard it would be from a user's perspective. I'm OK with the naming if most users can understand the effect of the parameter from reading the tooltip.
.push_constant(Type::FLOAT, "sharpness")
.sampler(0, ImageType::FLOAT_2D, "input_tx")
.sampler(1, ImageType::FLOAT_2D, "structure_tensor_tx")
.image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
.compute_source("compositor_kuwahara_anisotropic.glsl")
.do_static_compilation(true);

View File

@ -999,7 +999,9 @@ typedef struct NodeBilateralBlurData {
typedef struct NodeKuwaharaData {
short size;
short variation;
int smoothing;
int uniformity;
float sharpness;
float eccentricity;
} NodeKuwaharaData;
typedef struct NodeAntiAliasingData {

View File

@ -8826,14 +8826,35 @@ static void def_cmp_kuwahara(StructRNA *srna)
RNA_def_property_ui_text(prop, "", "Variation of Kuwahara filter to use");
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
prop = RNA_def_property(srna, "smoothing", PROP_INT, PROP_NONE);
RNA_def_property_int_sdna(prop, nullptr, "smoothing");
prop = RNA_def_property(srna, "uniformity", PROP_INT, PROP_NONE);
RNA_def_property_int_sdna(prop, nullptr, "uniformity");
RNA_def_property_range(prop, 0.0, 50.0);
RNA_def_property_ui_range(prop, 0, 50, 1, -1);
RNA_def_property_ui_text(prop,
"Smoothing",
"Smoothing degree before applying filter. Higher values remove details "
"and give smoother edges");
"Uniformity",
"Controls the uniformity of the direction of the filter. Higher values "
"produces more uniform directions");
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
prop = RNA_def_property(srna, "sharpness", PROP_FLOAT, PROP_FACTOR);
RNA_def_property_float_sdna(prop, nullptr, "sharpness");
RNA_def_property_range(prop, 0.0f, 1.0f);
RNA_def_property_ui_range(prop, 0.0f, 1.0f, 0.1, 3);
RNA_def_property_ui_text(prop,
"Sharpness",
"Controls the sharpness of the filter. 0 means completely smooth while "
"1 means completely sharp");
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
prop = RNA_def_property(srna, "eccentricity", PROP_FLOAT, PROP_FACTOR);
RNA_def_property_float_sdna(prop, nullptr, "eccentricity");
RNA_def_property_range(prop, 0.0f, 2.0f);
RNA_def_property_ui_range(prop, 0.0f, 2.0f, 0.1, 3);
RNA_def_property_ui_text(
prop,
"Eccentricity",
"Controls how directional the filter is. 0 means the filter is completely omnidirectional "
"while 2 means it is maximally directed along the edges of the image");
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
}

View File

@ -6,6 +6,8 @@
* \ingroup cmpnodes
*/
#include "BLI_math_base.hh"
#include "RNA_access.hh"
#include "UI_interface.hh"
@ -15,6 +17,7 @@
#include "COM_utilities.hh"
#include "COM_algorithm_summed_area_table.hh"
#include "COM_algorithm_symmetric_separable_blur.hh"
#include "node_composite_util.hh"
@ -38,8 +41,10 @@ static void node_composit_init_kuwahara(bNodeTree * /*ntree*/, bNode *node)
node->storage = data;
/* Set defaults. */
data->size = 4;
data->smoothing = 2;
data->size = 6;
data->uniformity = 4;
data->eccentricity = 1.0;
data->sharpness = 0.5;
}
static void node_composit_buts_kuwahara(uiLayout *layout, bContext * /*C*/, PointerRNA *ptr)
@ -54,7 +59,9 @@ static void node_composit_buts_kuwahara(uiLayout *layout, bContext * /*C*/, PointerRNA *ptr)
const int variation = RNA_enum_get(ptr, "variation");
if (variation == CMP_NODE_KUWAHARA_ANISOTROPIC) {
uiItemR(col, ptr, "smoothing", UI_ITEM_NONE, nullptr, ICON_NONE);
uiItemR(col, ptr, "uniformity", UI_ITEM_NONE, nullptr, ICON_NONE);
uiItemR(col, ptr, "sharpness", UI_ITEM_NONE, nullptr, ICON_NONE);
uiItemR(col, ptr, "eccentricity", UI_ITEM_NONE, nullptr, ICON_NONE);
}
}
@ -142,10 +149,102 @@ class ConvertKuwaharaOperation : public NodeOperation {
squared_table.release();
}
/* An implementation of the Anisotropic Kuwahara filter described in the paper:
*
* Kyprianidis, Jan Eric, Henry Kang, and Jurgen Dollner. "Image and video abstraction by
* anisotropic Kuwahara filtering." 2009.
*/
void execute_anisotropic()
{
get_input("Image").pass_through(get_result("Image"));
context().set_info_message("Viewport compositor setup not fully supported");
Result structure_tensor = compute_structure_tensor();
Result smoothed_structure_tensor = Result::Temporary(ResultType::Color, texture_pool());
symmetric_separable_blur(context(),
structure_tensor,
smoothed_structure_tensor,
float2(node_storage(bnode()).uniformity));
structure_tensor.release();
GPUShader *shader = shader_manager().get("compositor_kuwahara_anisotropic");
GPU_shader_bind(shader);
GPU_shader_uniform_1i(shader, "radius", node_storage(bnode()).size);
GPU_shader_uniform_1f(shader, "eccentricity", get_eccentricity());
GPU_shader_uniform_1f(shader, "sharpness", get_sharpness());
Result &input = get_input("Image");
input.bind_as_texture(shader, "input_tx");
smoothed_structure_tensor.bind_as_texture(shader, "structure_tensor_tx");
const Domain domain = compute_domain();
Result &output_image = get_result("Image");
output_image.allocate_texture(domain);
output_image.bind_as_image(shader, "output_img");
compute_dispatch_threads_at_least(shader, domain.size);
input.unbind_as_texture();
smoothed_structure_tensor.unbind_as_texture();
output_image.unbind_as_image();
GPU_shader_unbind();
smoothed_structure_tensor.release();
}
Result compute_structure_tensor()
{
GPUShader *shader = shader_manager().get(
"compositor_kuwahara_anisotropic_compute_structure_tensor");
GPU_shader_bind(shader);
Result &input = get_input("Image");
input.bind_as_texture(shader, "input_tx");
const Domain domain = compute_domain();
Result structure_tensor = Result::Temporary(ResultType::Color, texture_pool());
structure_tensor.allocate_texture(domain);
structure_tensor.bind_as_image(shader, "structure_tensor_img");
compute_dispatch_threads_at_least(shader, domain.size);
input.unbind_as_texture();
structure_tensor.unbind_as_image();
GPU_shader_unbind();
return structure_tensor;
}
/* The sharpness controls the sharpness of the transitions between the Kuwahara sectors, which
 * is determined by the weighting function pow(standard_deviation, -sharpness) as can be seen
 * in the shader. The transition is completely smooth when the sharpness is zero and completely
 * sharp when it is infinity. But realistically, the sharpness doesn't change much beyond the
 * value of 16 due to its exponential nature, so we just assume a maximum sharpness of 16.
*
* The stored sharpness is in the range [0, 1], so we multiply by 16 to get it in the range
* [0, 16], however, we also square it before multiplication to slow down the rate of change near
* zero to counter its exponential nature for more intuitive user control. */
float get_sharpness()
{
const float sharpness_factor = node_storage(bnode()).sharpness;
return sharpness_factor * sharpness_factor * 16.0f;
}
/* The eccentricity controls how much the image anisotropy affects the eccentricity of the
 * Kuwahara sectors, which is controlled by the following factor that gets multiplied by the
 * radius to get the ellipse width and divides the radius to get the ellipse height:
 *
 * (eccentricity + anisotropy) / eccentricity
 *
 * Since the anisotropy is in the [0, 1] range, the factor tends to 1 as the eccentricity tends
 * to infinity and tends to infinity when the eccentricity tends to zero. The stored
 * eccentricity is in the range [0, 2], which we map to the range [infinity, 0.5] by taking the
 * reciprocal,
OmarEmaraDev marked this conversation as resolved:
- Since the value gets mapped anyways, does it make sense to give the user the range `[0, 1]` and use `PROP_FACTOR` for UI instead?
- We do already use `PROP_FACTOR`, but there is a good reason why I made the UI range `[0, 2]`, because the maximum `2` doubles the computed eccentricity, while the minimum `0` zeros it. And `1` is an identity and leaves the computed eccentricity as is.
- True, `PROP_FACTOR` is already used. I doubt it's how users perceive it though, most artists probably would just see it as a slider with min and max value. But again, UI questions are always subjective, so it's fine for me if usage is clear to users :)
* satisfying the aforementioned limits. The upper limit doubles the computed default
* eccentricity, which users can use to enhance the directionality of the filter. Instead of
* actual infinity, we just use an eccentricity of 1 / 0.01 since the result is very similar to
* that of infinity. */
float get_eccentricity()
{
return 1.0f / math::max(0.01f, node_storage(bnode()).eccentricity);
}
};