2023-05-24 18:11:47 +02:00
134 changed files with 1321 additions and 581 deletions
--- a/build_files/utils/make_update.py
+++ b/build_files/utils/make_update.py
@ -434,7 +434,13 @@ def external_scripts_update(args: argparse.Namespace,
            # Switch to branch and pull.
            if submodule_branch:
                if make_utils.git_branch(args.git_command) != submodule_branch:
-                    if make_utils.git_remote_exist(args.git_command, "origin"):
+                    # If the local branch exists, just check it out.
+                    # If the branch exists only on a remote, specify the remote explicitly.
+                    # Without this explicit specification Git attempts to set up tracking
+                    # automatically and fails when the branch is available in multiple remotes.
+                    if make_utils.git_local_branch_exists(args.git_command, submodule_branch):
+                        call([args.git_command, "checkout", submodule_branch])
+                    elif make_utils.git_remote_exist(args.git_command, "origin"):
                        call([args.git_command, "checkout", "-t", f"origin/{submodule_branch}"])
                    elif make_utils.git_remote_exist(args.git_command, "upstream"):
                        call([args.git_command, "checkout", "-t", f"upstream/{submodule_branch}"])
--- a/build_files/utils/make_utils.py
+++ b/build_files/utils/make_utils.py
@ -54,9 +54,15 @@ def check_output(cmd: Sequence[str], exit_on_error: bool = True) -> str:
    return output.strip()


+def git_local_branch_exists(git_command: str, branch: str) -> bool:
+    return (
+        call([git_command, "rev-parse", "--verify", branch], exit_on_error=False, silent=True) == 0
+    )
+
+
 def git_branch_exists(git_command: str, branch: str) -> bool:
    return (
-        call([git_command, "rev-parse", "--verify", branch], exit_on_error=False, silent=True) == 0 or
+        git_local_branch_exists(git_command, branch) or
        call([git_command, "rev-parse", "--verify", "remotes/upstream/" + branch], exit_on_error=False, silent=True) == 0 or
        call([git_command, "rev-parse", "--verify", "remotes/origin/" + branch], exit_on_error=False, silent=True) == 0
    )
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@ -222,7 +222,10 @@ static void export_pointcloud_motion(PointCloud *pointcloud,

  /* Export motion points. */
  const int num_points = pointcloud->num_points();
-  float3 *mP = attr_mP->data_float3() + motion_step * num_points;
+  // Point cloud attributes are stored as float4 with the radius
+  // in the w element. This is explicit now as float3 is no longer
+  // interchangeable with float4 as it is packed now.
+  float4 *mP = attr_mP->data_float4() + motion_step * num_points;
  bool have_motion = false;
  const array<float3> &pointcloud_points = pointcloud->get_points();

@ -231,11 +234,9 @@ static void export_pointcloud_motion(PointCloud *pointcloud,
  std::optional<BL::FloatAttribute> b_attr_radius = find_radius_attribute(b_pointcloud);

  for (int i = 0; i < std::min(num_points, b_points_num); i++) {
-    const float3 co = get_float3(b_attr_position.data[i].vector());
+    const float3 P = get_float3(b_attr_position.data[i].vector());
    const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f;
-    float3 P = co;
-    P.w = radius;
-    mP[i] = P;
+    mP[i] = make_float4(P.x, P.y, P.z, radius);
    have_motion = have_motion || (P != pointcloud_points[i]);
  }

--- a/intern/cycles/bvh/build.cpp
+++ b/intern/cycles/bvh/build.cpp
@ -180,9 +180,9 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
        curve.bounds_grow(k, &hair->get_curve_keys()[0], curve_radius, bounds);
        const size_t num_keys = hair->get_curve_keys().size();
        const size_t num_steps = hair->get_motion_steps();
-        const float3 *key_steps = curve_attr_mP->data_float3();
+        const float4 *key_steps = curve_attr_mP->data_float4();
        for (size_t step = 0; step < num_steps - 1; step++) {
-          curve.bounds_grow(k, key_steps + step * num_keys, curve_radius, bounds);
+          curve.bounds_grow(k, key_steps + step * num_keys, bounds);
        }
        if (bounds.valid()) {
          int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k);
@ -200,7 +200,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
        const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
        const size_t num_steps = hair->get_motion_steps();
        const float3 *curve_keys = &hair->get_curve_keys()[0];
-        const float3 *key_steps = curve_attr_mP->data_float3();
+        const float4 *key_steps = curve_attr_mP->data_float4();
        const size_t num_keys = hair->get_curve_keys().size();
        /* Calculate bounding box of the previous time step.
         * Will be reused later to avoid duplicated work on
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@ -254,20 +254,15 @@ void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
  rtcSetGeometryBuildQuality(geom_id, build_quality);
  rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);

-  unsigned *rtc_indices = (unsigned *)rtcSetNewGeometryBuffer(
-      geom_id, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(int) * 3, num_triangles);
-  assert(rtc_indices);
-  if (!rtc_indices) {
-    VLOG_WARNING << "Embree could not create new geometry buffer for mesh " << mesh->name.c_str()
-                 << ".\n";
-    return;
-  }
-  for (size_t j = 0; j < num_triangles; ++j) {
-    Mesh::Triangle t = mesh->get_triangle(j);
-    rtc_indices[j * 3] = t.v[0];
-    rtc_indices[j * 3 + 1] = t.v[1];
-    rtc_indices[j * 3 + 2] = t.v[2];
-  }
+  const int *triangles = mesh->get_triangles().data();
+  rtcSetSharedGeometryBuffer(geom_id,
+                             RTC_BUFFER_TYPE_INDEX,
+                             0,
+                             RTC_FORMAT_UINT3,
+                             triangles,
+                             0,
+                             sizeof(int) * 3,
+                             num_triangles);

  set_tri_vertex_buffer(geom_id, mesh, false);

@ -309,28 +304,46 @@ void BVHEmbree::set_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh, con
      verts = &attr_mP->data_float3()[t_ * num_verts];
    }

-    float *rtc_verts = (update) ?
-                           (float *)rtcGetGeometryBufferData(geom_id, RTC_BUFFER_TYPE_VERTEX, t) :
-                           (float *)rtcSetNewGeometryBuffer(geom_id,
-                                                            RTC_BUFFER_TYPE_VERTEX,
-                                                            t,
-                                                            RTC_FORMAT_FLOAT3,
-                                                            sizeof(float) * 3,
-                                                            num_verts + 1);
-
-    assert(rtc_verts);
-    if (rtc_verts) {
-      for (size_t j = 0; j < num_verts; ++j) {
-        rtc_verts[0] = verts[j].x;
-        rtc_verts[1] = verts[j].y;
-        rtc_verts[2] = verts[j].z;
-        rtc_verts += 3;
-      }
-    }
-
    if (update) {
      rtcUpdateGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t);
    }
+    else {
+      rtcSetSharedGeometryBuffer(geom_id,
+                                 RTC_BUFFER_TYPE_VERTEX,
+                                 t,
+                                 RTC_FORMAT_FLOAT3,
+                                 verts,
+                                 0,
+                                 sizeof(float3),
+                                 num_verts + 1);
+    }
+  }
+}
+
+/**
+ * Packs the hair motion curve control vertices (CVs) into float4s as [x y z radius].
+ */
+template<typename T>
+void pack_motion_verts(size_t num_curves,
+                       const Hair *hair,
+                       const T *verts,
+                       const float *curve_radius,
+                       float4 *rtc_verts)
+{
+  for (size_t j = 0; j < num_curves; ++j) {
+    Hair::Curve c = hair->get_curve(j);
+    int fk = c.first_key;
+    int k = 1;
+    for (; k < c.num_keys + 1; ++k, ++fk) {
+      rtc_verts[k].x = verts[fk].x;
+      rtc_verts[k].y = verts[fk].y;
+      rtc_verts[k].z = verts[fk].z;
+      rtc_verts[k].w = curve_radius[fk];
+    }
+    /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */
+    rtc_verts[0] = rtc_verts[1];
+    rtc_verts[k] = rtc_verts[k - 1];
+    rtc_verts += c.num_keys + 2;
  }
 }

@ -360,15 +373,10 @@ void BVHEmbree::set_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair, c
  const int t_mid = (num_motion_steps - 1) / 2;
  const float *curve_radius = &hair->get_curve_radius()[0];
  for (int t = 0; t < num_motion_steps; ++t) {
-    const float3 *verts;
-    if (t == t_mid || attr_mP == NULL) {
-      verts = &hair->get_curve_keys()[0];
-    }
-    else {
-      int t_ = (t > t_mid) ? (t - 1) : t;
-      verts = &attr_mP->data_float3()[t_ * num_keys];
-    }
-
+    // As float4 and float3 are no longer interchangeable the 2 types need to be
+    // handled separately. Attributes are float4s where the radius is stored in w and
+    // the middle motion step is taken from the curve keys, which are stored as float3s
+    // with the radius stored in another array.
    float4 *rtc_verts = (update) ? (float4 *)rtcGetGeometryBufferData(
                                       geom_id, RTC_BUFFER_TYPE_VERTEX, t) :
                                   (float4 *)rtcSetNewGeometryBuffer(geom_id,
@ -381,18 +389,14 @@ void BVHEmbree::set_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair, c
    assert(rtc_verts);
    if (rtc_verts) {
      const size_t num_curves = hair->num_curves();
-      for (size_t j = 0; j < num_curves; ++j) {
-        Hair::Curve c = hair->get_curve(j);
-        int fk = c.first_key;
-        int k = 1;
-        for (; k < c.num_keys + 1; ++k, ++fk) {
-          rtc_verts[k] = float3_to_float4(verts[fk]);
-          rtc_verts[k].w = curve_radius[fk];
-        }
-        /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */
-        rtc_verts[0] = rtc_verts[1];
-        rtc_verts[k] = rtc_verts[k - 1];
-        rtc_verts += c.num_keys + 2;
+      if (t == t_mid || attr_mP == NULL) {
+        const float3 *verts = &hair->get_curve_keys()[0];
+        pack_motion_verts<float3>(num_curves, hair, verts, curve_radius, rtc_verts);
+      }
+      else {
+        int t_ = (t > t_mid) ? (t - 1) : t;
+        const float4 *verts = &attr_mP->data_float4()[t_ * num_keys];
+        pack_motion_verts<float4>(num_curves, hair, verts, curve_radius, rtc_verts);
      }
    }

@ -402,6 +406,20 @@ void BVHEmbree::set_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair, c
  }
 }

+/**
+ * Pack the motion points into a float4 as [x y z radius]
+ */
+template<typename T>
+void pack_motion_points(size_t num_points, const T *verts, const float *radius, float4 *rtc_verts)
+{
+  for (size_t j = 0; j < num_points; ++j) {
+    rtc_verts[j].x = verts[j].x;
+    rtc_verts[j].y = verts[j].y;
+    rtc_verts[j].z = verts[j].z;
+    rtc_verts[j].w = radius[j];
+  }
+}
+
 void BVHEmbree::set_point_vertex_buffer(RTCGeometry geom_id,
                                        const PointCloud *pointcloud,
                                        const bool update)
@ -421,15 +439,10 @@ void BVHEmbree::set_point_vertex_buffer(RTCGeometry geom_id,
  const int t_mid = (num_motion_steps - 1) / 2;
  const float *radius = pointcloud->get_radius().data();
  for (int t = 0; t < num_motion_steps; ++t) {
-    const float3 *verts;
-    if (t == t_mid || attr_mP == NULL) {
-      verts = pointcloud->get_points().data();
-    }
-    else {
-      int t_ = (t > t_mid) ? (t - 1) : t;
-      verts = &attr_mP->data_float3()[t_ * num_points];
-    }
-
+    // As float4 and float3 are no longer interchangeable the 2 types need to be
+    // handled separately. Attributes are float4s where the radius is stored in w and
+    // the middle motion step is taken from the point cloud points, which are stored as
+    // float3s with the radius stored in another array.
    float4 *rtc_verts = (update) ? (float4 *)rtcGetGeometryBufferData(
                                       geom_id, RTC_BUFFER_TYPE_VERTEX, t) :
                                   (float4 *)rtcSetNewGeometryBuffer(geom_id,
@ -441,9 +454,14 @@ void BVHEmbree::set_point_vertex_buffer(RTCGeometry geom_id,

    assert(rtc_verts);
    if (rtc_verts) {
-      for (size_t j = 0; j < num_points; ++j) {
-        rtc_verts[j] = float3_to_float4(verts[j]);
-        rtc_verts[j].w = radius[j];
+      if (t == t_mid || attr_mP == NULL) {
+        const float3 *verts = pointcloud->get_points().data();
+        pack_motion_points<float3>(num_points, verts, radius, rtc_verts);
+      }
+      else {
+        int t_ = (t > t_mid) ? (t - 1) : t;
+        const float4 *verts = &attr_mP->data_float4()[t_ * num_points];
+        pack_motion_points<float4>(num_points, verts, radius, rtc_verts);
      }
    }

--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@ -108,9 +108,10 @@ template<> struct device_type_traits<uint2> {
 };

 template<> struct device_type_traits<uint3> {
-  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements = 3;
-  static_assert(sizeof(uint3) == num_elements * datatype_size(data_type));
+  /* uint3 has different size depending on the device, can't use it for interchanging
+   * memory between CPU and GPU.
+   *
+   * Leave body empty to trigger a compile error if used. */
 };

 template<> struct device_type_traits<uint4> {
@ -132,9 +133,10 @@ template<> struct device_type_traits<int2> {
 };

 template<> struct device_type_traits<int3> {
-  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements = 4;
-  static_assert(sizeof(int3) == num_elements * datatype_size(data_type));
+  /* int3 has different size depending on the device, can't use it for interchanging
+   * memory between CPU and GPU.
+   *
+   * Leave body empty to trigger a compile error if used. */
 };

 template<> struct device_type_traits<int4> {
--- a/intern/cycles/kernel/data_arrays.h
+++ b/intern/cycles/kernel/data_arrays.h
@ -29,7 +29,7 @@ KERNEL_DATA_ARRAY(DecomposedTransform, camera_motion)
 /* triangles */
 KERNEL_DATA_ARRAY(uint, tri_shader)
 KERNEL_DATA_ARRAY(packed_float3, tri_vnormal)
-KERNEL_DATA_ARRAY(uint4, tri_vindex)
+KERNEL_DATA_ARRAY(packed_uint3, tri_vindex)
 KERNEL_DATA_ARRAY(uint, tri_patch)
 KERNEL_DATA_ARRAY(float2, tri_patch_uv)
 KERNEL_DATA_ARRAY(packed_float3, tri_verts)
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@ -126,10 +126,10 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
  isect->v = barycentrics.y;

  /* Record geometric normal */
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, isect->prim).w;
-  const float3 tri_a = float3(kernel_data_fetch(tri_verts, tri_vindex + 0));
-  const float3 tri_b = float3(kernel_data_fetch(tri_verts, tri_vindex + 1));
-  const float3 tri_c = float3(kernel_data_fetch(tri_verts, tri_vindex + 2));
+  const packed_uint3 tri_vindex = kernel_data_fetch(tri_vindex, isect->prim);
+  const float3 tri_a = float3(kernel_data_fetch(tri_verts, tri_vindex.x));
+  const float3 tri_b = float3(kernel_data_fetch(tri_verts, tri_vindex.y));
+  const float3 tri_c = float3(kernel_data_fetch(tri_verts, tri_vindex.z));
  payload.local_isect.Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  /* Continue tracing (without this the trace call would return after the first hit) */
--- a/intern/cycles/kernel/device/optix/bvh.h
+++ b/intern/cycles/kernel/device/optix/bvh.h
@ -120,10 +120,11 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
  isect->v = barycentrics.y;

  /* Record geometric normal. */
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
-  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0);
-  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1);
-  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  const packed_uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex.x);
+  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex.y);
+  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex.z);
+
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  /* Continue tracing (without this the trace call would return after the first hit). */
--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 /* Time interpolation of vertex positions and normals */

 ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
-                                                      uint4 tri_vindex,
+                                                      uint3 tri_vindex,
                                                      int offset,
                                                      int numverts,
                                                      int numsteps,
@ -30,9 +30,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    verts[0] = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
-    verts[1] = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
-    verts[2] = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+    verts[0] = kernel_data_fetch(tri_verts, tri_vindex.x);
+    verts[1] = kernel_data_fetch(tri_verts, tri_vindex.y);
+    verts[2] = kernel_data_fetch(tri_verts, tri_vindex.z);
  }
  else {
    /* center step not store in this array */
@ -48,7 +48,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 }

 ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
-                                                        uint4 tri_vindex,
+                                                        uint3 tri_vindex,
                                                        int offset,
                                                        int numverts,
                                                        int numsteps,
@ -92,7 +92,8 @@ ccl_device_inline void motion_triangle_vertices(

  /* fetch vertex coordinates */
  float3 next_verts[3];
-  uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+
+  uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);

  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
@ -121,7 +122,8 @@ ccl_device_inline void motion_triangle_vertices_and_normals(

  /* Fetch vertex coordinates. */
  float3 next_verts[3];
-  uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+
+  uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);

  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
@ -167,7 +169,8 @@ ccl_device_inline float3 motion_triangle_smooth_normal(

  /* fetch normals */
  float3 normals[3], next_normals[3];
-  uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+
+  uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);

  motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
  motion_triangle_normals_for_step(
--- a/intern/cycles/kernel/geom/motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/motion_triangle_shader.h
@ -47,7 +47,9 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
  kernel_assert(offset != ATTR_STD_NOT_FOUND);
  /* Fetch vertex coordinates. */
  float3 verts[3], next_verts[3];
-  uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+
+  uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+
  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
  /* Interpolate between steps. */
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@ -13,7 +13,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
                                              ccl_private const ShaderData *sd,
                                              float2 uv[3])
 {
-  uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+  uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);

  uv[0] = kernel_data_fetch(tri_patch_uv, tri_vindex.x);
  uv[1] = kernel_data_fetch(tri_patch_uv, tri_vindex.y);
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@ -15,10 +15,10 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderData *sd)
 {
  /* load triangle vertices */
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
-  const float3 v0 = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
-  const float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
-  const float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+  const float3 v0 = kernel_data_fetch(tri_verts, tri_vindex.x);
+  const float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.y);
+  const float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.z);

  /* return normal */
  if (object_negative_scale_applied(sd->object_flag)) {
@ -40,10 +40,11 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
                                             ccl_private int *shader)
 {
  /* load triangle vertices */
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
-  float3 v0 = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
-  float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
-  float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  float3 v0 = kernel_data_fetch(tri_verts, tri_vindex.x);
+  float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.y);
+  float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.z);
+
  /* compute point */
  float w = 1.0f - u - v;
  *P = (w * v0 + u * v1 + v * v2);
@ -64,10 +65,10 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,

 ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
 {
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
-  P[0] = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
-  P[1] = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
-  P[2] = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  P[0] = kernel_data_fetch(tri_verts, tri_vindex.x);
+  P[1] = kernel_data_fetch(tri_verts, tri_vindex.y);
+  P[2] = kernel_data_fetch(tri_verts, tri_vindex.z);
 }

 /* Triangle vertex locations and vertex normals */
@ -77,10 +78,11 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
                                                     float3 P[3],
                                                     float3 N[3])
 {
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
-  P[0] = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
-  P[1] = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
-  P[2] = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  P[0] = kernel_data_fetch(tri_verts, tri_vindex.x);
+  P[1] = kernel_data_fetch(tri_verts, tri_vindex.y);
+  P[2] = kernel_data_fetch(tri_verts, tri_vindex.z);
+
  N[0] = kernel_data_fetch(tri_vnormal, tri_vindex.x);
  N[1] = kernel_data_fetch(tri_vnormal, tri_vindex.y);
  N[2] = kernel_data_fetch(tri_vnormal, tri_vindex.z);
@ -92,7 +94,8 @@ ccl_device_inline float3
 triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
 {
  /* load triangle vertices */
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+
  float3 n0 = kernel_data_fetch(tri_vnormal, tri_vindex.x);
  float3 n1 = kernel_data_fetch(tri_vnormal, tri_vindex.y);
  float3 n2 = kernel_data_fetch(tri_vnormal, tri_vindex.z);
@ -106,7 +109,8 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
    KernelGlobals kg, ccl_private const ShaderData *sd, float3 Ng, int prim, float u, float v)
 {
  /* load triangle vertices */
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+
  float3 n0 = kernel_data_fetch(tri_vnormal, tri_vindex.x);
  float3 n1 = kernel_data_fetch(tri_vnormal, tri_vindex.y);
  float3 n2 = kernel_data_fetch(tri_vnormal, tri_vindex.z);
@ -131,10 +135,10 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
                                       ccl_private float3 *dPdv)
 {
  /* fetch triangle vertex coordinates */
-  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
-  const float3 p0 = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
-  const float3 p1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
-  const float3 p2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const float3 p0 = kernel_data_fetch(tri_verts, tri_vindex.x);
+  const float3 p1 = kernel_data_fetch(tri_verts, tri_vindex.y);
+  const float3 p2 = kernel_data_fetch(tri_verts, tri_vindex.z);

  /* compute derivatives of P w.r.t. uv */
  *dPdu = (p1 - p0);
@ -153,7 +157,8 @@ ccl_device float triangle_attribute_float(KernelGlobals kg,
    float f0, f1, f2;

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      const uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+
      f0 = kernel_data_fetch(attributes_float, desc.offset + tri_vindex.x);
      f1 = kernel_data_fetch(attributes_float, desc.offset + tri_vindex.y);
      f2 = kernel_data_fetch(attributes_float, desc.offset + tri_vindex.z);
@ -203,7 +208,8 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals kg,
    float2 f0, f1, f2;

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      const uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+
      f0 = kernel_data_fetch(attributes_float2, desc.offset + tri_vindex.x);
      f1 = kernel_data_fetch(attributes_float2, desc.offset + tri_vindex.y);
      f2 = kernel_data_fetch(attributes_float2, desc.offset + tri_vindex.z);
@ -253,7 +259,8 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
    float3 f0, f1, f2;

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      const uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+
      f0 = kernel_data_fetch(attributes_float3, desc.offset + tri_vindex.x);
      f1 = kernel_data_fetch(attributes_float3, desc.offset + tri_vindex.y);
      f2 = kernel_data_fetch(attributes_float3, desc.offset + tri_vindex.z);
@ -304,7 +311,8 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
    float4 f0, f1, f2;

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      const uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+
      f0 = kernel_data_fetch(attributes_float4, desc.offset + tri_vindex.x);
      f1 = kernel_data_fetch(attributes_float4, desc.offset + tri_vindex.y);
      f2 = kernel_data_fetch(attributes_float4, desc.offset + tri_vindex.z);
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@ -24,10 +24,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
                                          int prim,
                                          int prim_addr)
 {
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
-  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
-               tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
-               tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex.x),
+               tri_b = kernel_data_fetch(tri_verts, tri_vindex.y),
+               tri_c = kernel_data_fetch(tri_verts, tri_vindex.z);
+
  float t, u, v;
  if (ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
@ -68,10 +69,11 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
                                                ccl_private uint *lcg_state,
                                                int max_hits)
 {
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
-  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
-               tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
-               tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex.x),
+               tri_b = kernel_data_fetch(tri_verts, tri_vindex.y),
+               tri_c = kernel_data_fetch(tri_verts, tri_vindex.z);
+
  float t, u, v;
  if (!ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
    return false;
@ -141,10 +143,10 @@ ccl_device_inline float3 triangle_point_from_uv(KernelGlobals kg,
                                                const float u,
                                                const float v)
 {
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, isect_prim).w;
-  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
-                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
-                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, isect_prim);
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex.x),
+               tri_b = kernel_data_fetch(tri_verts, tri_vindex.y),
+               tri_c = kernel_data_fetch(tri_verts, tri_vindex.z);

  /* This appears to give slightly better precision than interpolating with w = (1 - u - v). */
  float3 P = tri_a + u * (tri_b - tri_a) + v * (tri_c - tri_a);
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@ -56,10 +56,10 @@ ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg,
   *   or dot(sd->Ng, ray_D)  is small. Detect such cases and skip test?
   * - Instead of ray offset, can we tweak P to lie within the triangle?
   */
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, sd->prim).w;
-  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
-                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
-                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  const uint3 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex.x),
+                      tri_b = kernel_data_fetch(tri_verts, tri_vindex.y),
+                      tri_c = kernel_data_fetch(tri_verts, tri_vindex.z);

  float3 local_ray_P = ray_P;
  float3 local_ray_D = ray_D;
--- a/intern/cycles/scene/attribute.cpp
+++ b/intern/cycles/scene/attribute.cpp
@ -167,6 +167,12 @@ size_t Attribute::data_sizeof() const
    return sizeof(float2);
  else if (type == TypeDesc::TypeMatrix)
    return sizeof(Transform);
+  // The float3 type is not interchangeable with float4
+  // as it is now a packed type.
+  else if (type == TypeDesc::TypeFloat4)
+    return sizeof(float4);
+  else if (type == TypeRGBA)
+    return sizeof(float4);
  else
    return sizeof(float3);
 }
@ -300,7 +306,8 @@ void Attribute::add_with_weight(void *dst, void *src, float weight)
    *((float2 *)dst) += *((float2 *)src) * weight;
  }
  else if (same_storage(type, TypeDesc::TypeVector)) {
-    *((float4 *)dst) += *((float4 *)src) * weight;
+    // Points are float3s and not float4s
+    *((float3 *)dst) += *((float3 *)src) * weight;
  }
  else {
    assert(!"not implemented for this type");
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@ -1095,10 +1095,10 @@ void GeometryManager::device_update_mesh(Device *,
    /* normals */
    progress.set_status("Updating Mesh", "Computing normals");

-    packed_float3 *tri_verts = dscene->tri_verts.alloc(tri_size * 3);
+    packed_float3 *tri_verts = dscene->tri_verts.alloc(vert_size);
    uint *tri_shader = dscene->tri_shader.alloc(tri_size);
    packed_float3 *vnormal = dscene->tri_vnormal.alloc(vert_size);
-    uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
+    packed_uint3 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
    uint *tri_patch = dscene->tri_patch.alloc(tri_size);
    float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);

@ -1123,7 +1123,7 @@ void GeometryManager::device_update_mesh(Device *,

        if (mesh->verts_is_modified() || mesh->triangles_is_modified() ||
            mesh->vert_patch_uv_is_modified() || copy_all_data) {
-          mesh->pack_verts(&tri_verts[mesh->prim_offset * 3],
+          mesh->pack_verts(&tri_verts[mesh->vert_offset],
                           &tri_vindex[mesh->prim_offset],
                           &tri_patch[mesh->prim_offset],
                           &tri_patch_uv[mesh->vert_offset]);
--- a/intern/cycles/scene/hair.cpp
+++ b/intern/cycles/scene/hair.cpp
@ -15,6 +15,27 @@
 CCL_NAMESPACE_BEGIN

 /* Hair Curve */
+void Hair::Curve::bounds_grow(const int k, const float4 *keys, BoundBox &bounds) const
+{
+  /* Segment `k` spans P[1]..P[2]; P[0] and P[3] are neighbors clamped to this curve's keys. */
+  float3 P[4];
+  P[0] = float4_to_float3(keys[max(first_key + k - 1, first_key)]);
+  P[1] = float4_to_float3(keys[first_key + k]);
+  P[2] = float4_to_float3(keys[first_key + k + 1]);
+  P[3] = float4_to_float3(keys[min(first_key + k + 2, first_key + num_keys - 1)]);
+
+  float3 lower;
+  float3 upper;
+
+  curvebounds(&lower.x, &upper.x, P, 0);
+  curvebounds(&lower.y, &upper.y, P, 1);
+  curvebounds(&lower.z, &upper.z, P, 2);
+  /* Take `w` from the segment's own end keys, i.e. the same indices as P[1] and P[2]. */
+  float mr = max(keys[first_key + k].w, keys[first_key + k + 1].w);
+
+  bounds.grow(lower, mr);
+  bounds.grow(upper, mr);
+}

 void Hair::Curve::bounds_grow(const int k,
                              const float3 *curve_keys,
@ -96,7 +117,7 @@ void Hair::Curve::bounds_grow(float4 keys[4], BoundBox &bounds) const

 void Hair::Curve::motion_keys(const float3 *curve_keys,
                              const float *curve_radius,
-                              const float3 *key_steps,
+                              const float4 *key_steps,
                              size_t num_curve_keys,
                              size_t num_steps,
                              float time,
@ -122,7 +143,7 @@ void Hair::Curve::motion_keys(const float3 *curve_keys,

 void Hair::Curve::cardinal_motion_keys(const float3 *curve_keys,
                                       const float *curve_radius,
-                                       const float3 *key_steps,
+                                       const float4 *key_steps,
                                       size_t num_curve_keys,
                                       size_t num_steps,
                                       float time,
@ -170,7 +191,7 @@ void Hair::Curve::cardinal_motion_keys(const float3 *curve_keys,

 void Hair::Curve::keys_for_step(const float3 *curve_keys,
                                const float *curve_radius,
-                                const float3 *key_steps,
+                                const float4 *key_steps,
                                size_t num_curve_keys,
                                size_t num_steps,
                                size_t step,
@ -214,7 +235,7 @@ void Hair::Curve::keys_for_step(const float3 *curve_keys,

 void Hair::Curve::cardinal_keys_for_step(const float3 *curve_keys,
                                         const float *curve_radius,
-                                         const float3 *key_steps,
+                                         const float4 *key_steps,
                                         size_t num_curve_keys,
                                         size_t num_steps,
                                         size_t step,
@ -384,10 +405,12 @@ void Hair::compute_bounds()
    Attribute *curve_attr = attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
    if (use_motion_blur && curve_attr) {
      size_t steps_size = curve_keys.size() * (motion_steps - 1);
-      float3 *key_steps = curve_attr->data_float3();
+      // Attribute data is stored as a float4 and is not
+      // interchangeable with float3
+      float4 *key_steps = curve_attr->data_float4();

      for (size_t i = 0; i < steps_size; i++)
-        bnds.grow(key_steps[i]);
+        bnds.grow(float4_to_float3(key_steps[i]));
    }

    if (!bnds.valid()) {
@ -399,10 +422,12 @@ void Hair::compute_bounds()

      if (use_motion_blur && curve_attr) {
        size_t steps_size = curve_keys.size() * (motion_steps - 1);
-        float3 *key_steps = curve_attr->data_float3();
+        // Attribute data is stored as a float4 which is not
+        // interchangeable with float3
+        float4 *key_steps = curve_attr->data_float4();

        for (size_t i = 0; i < steps_size; i++)
-          bnds.grow_safe(key_steps[i]);
+          bnds.grow_safe(float4_to_float3(key_steps[i]));
      }
    }
  }
--- a/intern/cycles/scene/hair.h
+++ b/intern/cycles/scene/hair.h
@ -28,7 +28,9 @@ class Hair : public Geometry {
                     const float3 *curve_keys,
                     const float *curve_radius,
                     BoundBox &bounds) const;
+    void bounds_grow(const int k, const float4 *keys, BoundBox &bounds) const;
    void bounds_grow(float4 keys[4], BoundBox &bounds) const;
+    void bounds_grow(float3 keys[4], BoundBox &bounds) const;
    void bounds_grow(const int k,
                     const float3 *curve_keys,
                     const float *curve_radius,
@ -37,7 +39,7 @@ class Hair : public Geometry {

    void motion_keys(const float3 *curve_keys,
                     const float *curve_radius,
-                     const float3 *key_steps,
+                     const float4 *key_steps,
                     size_t num_curve_keys,
                     size_t num_steps,
                     float time,
@ -46,7 +48,7 @@ class Hair : public Geometry {
                     float4 r_keys[2]) const;
    void cardinal_motion_keys(const float3 *curve_keys,
                              const float *curve_radius,
-                              const float3 *key_steps,
+                              const float4 *key_steps,
                              size_t num_curve_keys,
                              size_t num_steps,
                              float time,
@ -58,7 +60,7 @@ class Hair : public Geometry {

    void keys_for_step(const float3 *curve_keys,
                       const float *curve_radius,
-                       const float3 *key_steps,
+                       const float4 *key_steps,
                       size_t num_curve_keys,
                       size_t num_steps,
                       size_t step,
@ -67,7 +69,7 @@ class Hair : public Geometry {
                       float4 r_keys[2]) const;
    void cardinal_keys_for_step(const float3 *curve_keys,
                                const float *curve_radius,
-                                const float3 *key_steps,
+                                const float4 *key_steps,
                                size_t num_curve_keys,
                                size_t num_steps,
                                size_t step,
--- a/intern/cycles/scene/mesh.cpp
+++ b/intern/cycles/scene/mesh.cpp
@ -587,10 +587,14 @@ void Mesh::add_vertex_normals()
      }
    }

-    for (size_t i = 0; i < verts_size; i++) {
-      vN[i] = normalize(vN[i]);
-      if (flip) {
-        vN[i] = -vN[i];
+    if (flip) {
+      for (size_t i = 0; i < verts_size; i++) {
+        vN[i] = -normalize(vN[i]);
+      }
+    }
+    else {
+      for (size_t i = 0; i < verts_size; i++) {
+        vN[i] = normalize(vN[i]);
      }
    }
  }
@ -611,16 +615,21 @@ void Mesh::add_vertex_normals()
      memset(mN, 0, verts.size() * sizeof(float3));

      for (size_t i = 0; i < triangles_size; i++) {
+        Triangle tri = get_triangle(i);
+        float3 fN = tri.compute_normal(mP);
        for (size_t j = 0; j < 3; j++) {
-          float3 fN = get_triangle(i).compute_normal(mP);
-          mN[get_triangle(i).v[j]] += fN;
+          mN[tri.v[j]] += fN;
        }
      }

-      for (size_t i = 0; i < verts_size; i++) {
-        mN[i] = normalize(mN[i]);
-        if (flip) {
-          mN[i] = -mN[i];
+      if (flip) {
+        for (size_t i = 0; i < verts_size; i++) {
+          mN[i] = -normalize(mN[i]);
+        }
+      }
+      else {
+        for (size_t i = 0; i < verts_size; i++) {
+          mN[i] = normalize(mN[i]);
        }
      }
    }
@ -645,10 +654,14 @@ void Mesh::add_vertex_normals()
      }
    }

-    for (size_t i = 0; i < verts_size; i++) {
-      vN[i] = normalize(vN[i]);
-      if (flip) {
-        vN[i] = -vN[i];
+    if (flip) {
+      for (size_t i = 0; i < verts_size; i++) {
+        vN[i] = -normalize(vN[i]);
+      }
+    }
+    else {
+      for (size_t i = 0; i < verts_size; i++) {
+        vN[i] = normalize(vN[i]);
      }
    }
  }
@ -724,43 +737,53 @@ void Mesh::pack_normals(packed_float3 *vnormal)
  float3 *vN = attr_vN->data_float3();
  size_t verts_size = verts.size();

-  for (size_t i = 0; i < verts_size; i++) {
-    float3 vNi = vN[i];
-
-    if (do_transform)
-      vNi = safe_normalize(transform_direction(&ntfm, vNi));
-
-    vnormal[i] = make_float3(vNi.x, vNi.y, vNi.z);
+  if (do_transform) {
+    for (size_t i = 0; i < verts_size; i++) {
+      vnormal[i] = safe_normalize(transform_direction(&ntfm, vN[i]));
+    }
+  }
+  else {
+    for (size_t i = 0; i < verts_size; i++) {
+      vnormal[i] = vN[i];
+    }
  }
 }

 void Mesh::pack_verts(packed_float3 *tri_verts,
-                      uint4 *tri_vindex,
+                      packed_uint3 *tri_vindex,
                      uint *tri_patch,
                      float2 *tri_patch_uv)
 {
  size_t verts_size = verts.size();
-
+  size_t triangles_size = num_triangles();
+  const int *p_tris = triangles.data();
+  int off = 0;
  if (verts_size && get_num_subd_faces()) {
    float2 *vert_patch_uv_ptr = vert_patch_uv.data();

    for (size_t i = 0; i < verts_size; i++) {
+      tri_verts[i] = verts[i];
      tri_patch_uv[i] = vert_patch_uv_ptr[i];
    }
+    for (size_t i = 0; i < triangles_size; i++) {
+      tri_vindex[i] = make_packed_uint3(p_tris[off + 0] + vert_offset,
+                                        p_tris[off + 1] + vert_offset,
+                                        p_tris[off + 2] + vert_offset);
+      tri_patch[i] = triangle_patch[i] * 8 + patch_offset;
+      off += 3;
+    }
  }
-
-  size_t triangles_size = num_triangles();
-
-  for (size_t i = 0; i < triangles_size; i++) {
-    const Triangle t = get_triangle(i);
-    tri_vindex[i] = make_uint4(
-        t.v[0] + vert_offset, t.v[1] + vert_offset, t.v[2] + vert_offset, 3 * (prim_offset + i));
-
-    tri_patch[i] = (!get_num_subd_faces()) ? -1 : (triangle_patch[i] * 8 + patch_offset);
-
-    tri_verts[i * 3] = verts[t.v[0]];
-    tri_verts[i * 3 + 1] = verts[t.v[1]];
-    tri_verts[i * 3 + 2] = verts[t.v[2]];
+  else {
+    for (size_t i = 0; i < verts_size; i++) {
+      tri_verts[i] = verts[i];
+    }
+    for (size_t i = 0; i < triangles_size; i++) {
+      tri_vindex[i] = make_packed_uint3(p_tris[off + 0] + vert_offset,
+                                        p_tris[off + 1] + vert_offset,
+                                        p_tris[off + 2] + vert_offset);
+      tri_patch[i] = -1;
+      off += 3;
+    }
  }
 }

--- a/intern/cycles/scene/mesh.h
+++ b/intern/cycles/scene/mesh.h
@ -216,7 +216,7 @@ class Mesh : public Geometry {
  void pack_shaders(Scene *scene, uint *shader);
  void pack_normals(packed_float3 *vnormal);
  void pack_verts(packed_float3 *tri_verts,
-                  uint4 *tri_vindex,
+                  packed_uint3 *tri_vindex,
                  uint *tri_patch,
                  float2 *tri_patch_uv);
  void pack_patches(uint *patch_data);
--- a/intern/cycles/scene/mesh_subdivision.cpp
+++ b/intern/cycles/scene/mesh_subdivision.cpp
@ -262,9 +262,14 @@ class OsdData {
        }
        else if (attr.same_storage(attr.type, TypeFloat2)) {
          primvar_refiner.Interpolate(i + 1, (OsdValue<float2> *)src, (OsdValue<float2> *&)dest);
+          // float3 is not interchangeable with float4 and so needs to be handled
+          // separately
+        }
+        else if (attr.same_storage(attr.type, TypeFloat4)) {
+          primvar_refiner.Interpolate(i + 1, (OsdValue<float4> *)src, (OsdValue<float4> *&)dest);
        }
        else {
-          primvar_refiner.Interpolate(i + 1, (OsdValue<float4> *)src, (OsdValue<float4> *&)dest);
+          primvar_refiner.Interpolate(i + 1, (OsdValue<float3> *)src, (OsdValue<float3> *&)dest);
        }

        src = dest;
@ -281,11 +286,20 @@ class OsdData {
              (OsdValue<float2> *)&attr.buffer[0],
              (OsdValue<float2> *)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
        }
-        else {
+        else if (attr.same_storage(attr.type, TypeFloat4)) {
+          // float3 is not interchangeable with float4 and so needs to be handled
+          // separately
          patch_table->ComputeLocalPointValues(
              (OsdValue<float4> *)&attr.buffer[0],
              (OsdValue<float4> *)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
        }
+        else {
+          // float3 is not interchangeable with float4 and so needs to be handled
+          // separately
+          patch_table->ComputeLocalPointValues(
+              (OsdValue<float3> *)&attr.buffer[0],
+              (OsdValue<float3> *)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+        }
      }
    }
    else if (attr.element == ATTR_ELEMENT_CORNER || attr.element == ATTR_ELEMENT_CORNER_BYTE) {
--- a/intern/cycles/scene/pointcloud.cpp
+++ b/intern/cycles/scene/pointcloud.cpp
@ -152,8 +152,14 @@ void PointCloud::copy_center_to_motion_step(const int motion_step)
  if (attr_mP) {
    float3 *points_data = points.data();
    size_t numpoints = points.size();
-    memcpy(
-        attr_mP->data_float3() + motion_step * numpoints, points_data, sizeof(float3) * numpoints);
+    float *radius_data = radius.data();
+
+    float4 *attrib_P = attr_mP->data_float4() + motion_step * numpoints;
+    for (int i = 0; i < numpoints; i++) {
+      float3 P = points_data[i];
+      float r = radius_data[i];
+      attrib_P[i] = make_float4(P.x, P.y, P.z, r);
+    }
  }
 }

--- a/intern/cycles/scene/scene.h
+++ b/intern/cycles/scene/scene.h
@ -72,7 +72,7 @@ class DeviceScene {
  device_vector<packed_float3> tri_verts;
  device_vector<uint> tri_shader;
  device_vector<packed_float3> tri_vnormal;
-  device_vector<uint4> tri_vindex;
+  device_vector<packed_uint3> tri_vindex;
  device_vector<uint> tri_patch;
  device_vector<float2> tri_patch_uv;

--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@ -84,7 +84,8 @@ ccl_device_inline float3 operator/(const float3 a, const float f)
 #  if defined(__KERNEL_SSE__)
  return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
 #  else
-  return make_float3(a.x / f, a.y / f, a.z / f);
+  float invf = 1.0f / f;
+  return make_float3(a.x * invf, a.y * invf, a.z * invf);
 #  endif
 }

--- a/intern/cycles/util/types_int3.h
+++ b/intern/cycles/util/types_int3.h
@ -50,4 +50,45 @@ ccl_device_inline int3 make_int3(int x, int y, int z);
 ccl_device_inline int3 make_int3(int i);
 ccl_device_inline void print_int3(ccl_private const char *label, const int3 a);

+#if defined(__KERNEL_METAL__)
+/* Metal has native packed_int3. */
+#elif defined(__KERNEL_CUDA__)
+/* CUDA int3 is already packed. */
+typedef int3 packed_int3;
+#else
+/* HIP int3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706). */
+struct packed_int3 {
+  int x, y, z;
+  /* Default constructor: has no initializers, so x, y and z are left uninitialized. */
+  ccl_device_inline_method packed_int3(){};
+  /* Construct from three separate components. */
+  ccl_device_inline_method packed_int3(const int px, const int py, const int pz)
+      : x(px), y(py), z(pz){};
+  /* Implicit conversion from int3, copying x, y and z. */
+  ccl_device_inline_method packed_int3(const int3 &a) : x(a.x), y(a.y), z(a.z)
+  {
+  }
+  /* Implicit conversion back to int3. */
+  ccl_device_inline_method operator int3() const
+  {
+    return make_int3(x, y, z);
+  }
+  /* Assignment from int3, copying x, y and z. */
+  ccl_device_inline_method packed_int3 &operator=(const int3 &a)
+  {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+  /* Component access by index; only declared when not compiling for a GPU. */
+#  ifndef __KERNEL_GPU__
+  __forceinline int operator[](int i) const;
+  __forceinline int &operator[](int i);
+#  endif
+};
+
+static_assert(sizeof(packed_int3) == 12, "packed_int3 expected to be exactly 12 bytes");
+#endif
+
 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_int3_impl.h
+++ b/intern/cycles/util/types_int3_impl.h
@ -54,6 +54,18 @@ __forceinline int &int3::operator[](int i)
  util_assert(i < 3);
  return *(&x + i);
 }
+
+__forceinline int packed_int3::operator[](int i) const
+{
+  /* `i` is signed: reject negative indices too, they would read before `x`. */
+  util_assert(i >= 0);
+  util_assert(i < 3);
+  /* Index from `x`, relying on x, y and z being laid out consecutively. */
+  return *(&x + i);
+}
+
+__forceinline int &packed_int3::operator[](int i)
+{
+  /* `i` is signed: reject negative indices too, they would reference memory before `x`. */
+  util_assert(i >= 0);
+  util_assert(i < 3);
+  /* Index from `x`, relying on x, y and z being laid out consecutively. */
+  return *(&x + i);
+}
 #  endif

 ccl_device_inline int3 make_int3(int x, int y, int z)
@ -80,6 +92,12 @@ ccl_device_inline int3 make_int3(int i)
 #endif
 }

+ccl_device_inline packed_int3 make_packed_int3(int x, int y, int z)
+{
+  packed_int3 a = {x, y, z};
+  return a;
+}
+
 ccl_device_inline void print_int3(ccl_private const char *label, const int3 a)
 {
 #ifdef __KERNEL_PRINTF__
--- a/intern/cycles/util/types_uint3.h
+++ b/intern/cycles/util/types_uint3.h
@ -22,4 +22,44 @@ struct uint3 {
 ccl_device_inline uint3 make_uint3(uint x, uint y, uint z);
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

+#if defined(__KERNEL_METAL__)
+/* Metal has native packed_uint3. */
+#elif defined(__KERNEL_CUDA__)
+/* CUDA uint3 is already packed. */
+typedef uint3 packed_uint3;
+#else
+/* HIP uint3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706). */
+struct packed_uint3 {
+  uint x, y, z;
+  /* Default constructor: has no initializers, so x, y and z are left uninitialized. */
+  ccl_device_inline_method packed_uint3(){};
+  /* Construct from three separate components. */
+  ccl_device_inline_method packed_uint3(const uint px, const uint py, const uint pz)
+      : x(px), y(py), z(pz){};
+  /* Implicit conversion from uint3, copying x, y and z. */
+  ccl_device_inline_method packed_uint3(const uint3 &a) : x(a.x), y(a.y), z(a.z)
+  {
+  }
+  /* Implicit conversion back to uint3. */
+  ccl_device_inline_method operator uint3() const
+  {
+    return make_uint3(x, y, z);
+  }
+  /* Assignment from uint3, copying x, y and z. */
+  ccl_device_inline_method packed_uint3 &operator=(const uint3 &a)
+  {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+  /* Component access by index; only declared when not compiling for a GPU. */
+#  ifndef __KERNEL_GPU__
+  __forceinline uint operator[](uint i) const;
+  __forceinline uint &operator[](uint i);
+#  endif
+};
+
+static_assert(sizeof(packed_uint3) == 12, "packed_uint3 expected to be exactly 12 bytes");
+#endif
 CCL_NAMESPACE_END
--- a/intern/cycles/util/types_uint3_impl.h
+++ b/intern/cycles/util/types_uint3_impl.h
@ -22,6 +22,18 @@ __forceinline uint &uint3::operator[](uint i)
  util_assert(i < 3);
  return *(&x + i);
 }
+
+__forceinline uint packed_uint3::operator[](uint i) const
+{
+  /* Components are addressed as an array starting at `x`; only 0, 1 and 2 are valid. */
+  util_assert(i < 3);
+  const uint *components = &x;
+  return components[i];
+}
+
+__forceinline uint &packed_uint3::operator[](uint i)
+{
+  /* Mutable access: same addressing from `x`, returning a reference to the component. */
+  util_assert(i < 3);
+  uint *components = &x;
+  return components[i];
+}
 #  endif

 ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
@ -29,6 +41,12 @@ ccl_device_inline uint3 make_uint3(uint x, uint y, uint z)
  uint3 a = {x, y, z};
  return a;
 }
+
+ccl_device_inline packed_uint3 make_packed_uint3(uint x, uint y, uint z)
+{
+  packed_uint3 a = {x, y, z};
+  return a;
+}
 #endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
--- a/intern/ghost/GHOST_C-api.h
+++ b/intern/ghost/GHOST_C-api.h
@ -1248,8 +1248,8 @@ void GHOST_GetVulkanHandles(GHOST_ContextHandle context,
 void GHOST_GetVulkanCommandBuffer(GHOST_ContextHandle context, void *r_command_buffer);

 /**
- * Gets the Vulkan backbuffer related resource handles associated with the Vulkan context.
- * Needs to be called after each swap event as the backbuffer will change.
+ * Gets the Vulkan back-buffer related resource handles associated with the Vulkan context.
+ * Needs to be called after each swap event as the back-buffer will change.
 *
 * Should should only be called when using a Vulkan context with an active swap chain.
 * Other contexts will not return any handles and leave the
--- a/intern/ghost/GHOST_IContext.h
+++ b/intern/ghost/GHOST_IContext.h
@ -91,8 +91,8 @@ class GHOST_IContext {
  virtual GHOST_TSuccess getVulkanCommandBuffer(void *r_command_buffer) = 0;

  /**
-   * Gets the Vulkan backbuffer related resource handles associated with the Vulkan context.
-   * Needs to be called after each swap event as the backbuffer will change.
+   * Gets the Vulkan back-buffer related resource handles associated with the Vulkan context.
+   * Needs to be called after each swap event as the back-buffer will change.
   *
   * \param r_image: After calling this function the VkImage
   *     referenced by this parameter will contain the VKImage handle
--- a/intern/ghost/intern/GHOST_ContextVK.cpp
+++ b/intern/ghost/intern/GHOST_ContextVK.cpp
@ -535,7 +535,7 @@ static GHOST_TSuccess getGraphicQueueFamily(VkPhysicalDevice device, uint32_t *r
  for (const auto &queue_family : queue_families) {
    /* Every vulkan implementation by spec must have one queue family that support both graphics
     * and compute pipelines. We select this one; compute only queue family hints at async compute
-     * implementations.*/
+     * implementations. */
    if ((queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) &&
        (queue_family.queueFlags & VK_QUEUE_COMPUTE_BIT)) {
      return GHOST_kSuccess;
--- a/scripts/startup/bl_ui/space_outliner.py
+++ b/scripts/startup/bl_ui/space_outliner.py
@ -484,6 +484,10 @@ class OUTLINER_PT_filter(Panel):
            row = sub.row()
            row.label(icon='CAMERA_DATA')
            row.prop(space, "use_filter_object_camera", text="Cameras")
+        if bpy.data.grease_pencils:
+            row = sub.row()
+            row.label(icon='STROKE')
+            row.prop(space, "use_filter_object_grease_pencil", text="Grease Pencil")
        row = sub.row()
        row.label(icon='EMPTY_DATA')
        row.prop(space, "use_filter_object_empty", text="Empties")
--- a/source/blender/asset_system/intern/asset_essentials_library.cc
+++ b/source/blender/asset_system/intern/asset_essentials_library.cc
@ -16,6 +16,9 @@ StringRefNull essentials_directory_path()
 {
  static std::string path = []() {
    const char *datafiles_path = BKE_appdir_folder_id(BLENDER_DATAFILES, "assets");
+    if (datafiles_path == nullptr) {
+      return "";
+    }
    return datafiles_path;
  }();
  return path;
--- a/source/blender/asset_system/intern/asset_library_service.cc
+++ b/source/blender/asset_system/intern/asset_library_service.cc
@ -63,6 +63,9 @@ AssetLibrary *AssetLibraryService::get_asset_library(
  switch (type) {
    case ASSET_LIBRARY_ESSENTIALS: {
      const StringRefNull root_path = essentials_directory_path();
+      if (root_path.is_empty()) {
+        return nullptr;
+      }

      AssetLibrary *library = get_asset_library_on_disk(root_path);
      library->import_method_ = ASSET_IMPORT_APPEND_REUSE;
--- a/source/blender/blenkernel/BKE_nla.h
+++ b/source/blender/blenkernel/BKE_nla.h
@ -162,9 +162,9 @@ void BKE_nlastrips_sort_strips(ListBase *strips);
 void BKE_nlastrips_add_strip_unsafe(ListBase *strips, struct NlaStrip *strip);

 /**
- *  NULL checks incoming strip and verifies no overlap / invalid
- *  configuration against other strips in NLA Track before calling
- *  #BKE_nlastrips_add_strip_unsafe.
+ * NULL checks incoming strip and verifies no overlap / invalid
+ * configuration against other strips in NLA Track before calling
+ * #BKE_nlastrips_add_strip_unsafe.
 */
 bool BKE_nlastrips_add_strip(ListBase *strips, struct NlaStrip *strip);

--- a/source/blender/blenkernel/intern/anim_sys.c
+++ b/source/blender/blenkernel/intern/anim_sys.c
@ -3287,7 +3287,7 @@ static bool is_action_track_evaluated_without_nla(const AnimData *adt,
 * sure why. Preferably, it would be as simple as checking for `(adt->act_Track == nlt)` but that
 * doesn't work either, neither does comparing indices.
 *
- *  This function is a temporary work around. The first disabled track is always the tweaked track.
+ * This function is a temporary work around. The first disabled track is always the tweaked track.
 */
 static NlaTrack *nlatrack_find_tweaked(const AnimData *adt)
 {
--- a/source/blender/blenkernel/intern/cloth.cc
+++ b/source/blender/blenkernel/intern/cloth.cc
@ -641,7 +641,7 @@ static void cloth_apply_vgroup(ClothModifierData *clmd, Mesh *mesh)
          if (dvert->dw[j].def_nr == (clmd->sim_parms->vgroup_mass - 1)) {
            verts->goal = dvert->dw[j].weight;

-            /* goalfac= 1.0f; */ /* UNUSED */
+            // goalfac = 1.0f; /* UNUSED */

            /* Kicking goal factor to simplify things...who uses that anyway? */
            // ABS (clmd->sim_parms->maxgoal - clmd->sim_parms->mingoal);
@ -1779,7 +1779,7 @@ static bool cloth_build_springs(ClothModifierData *clmd, Mesh *mesh)
          index2 = ((tspring->ij == tspring2->kl) ? (tspring->kl) : (tspring->ij));

          /* Check for existing spring. */
-          /* Check also if startpoint is equal to endpoint. */
+          /* Check also if start-point is equal to endpoint. */
          if ((index2 != tspring2->ij) && !BLI_edgeset_haskey(edgeset, tspring2->ij, index2)) {
            spring = (ClothSpring *)MEM_callocN(sizeof(ClothSpring), "cloth spring");

@ -1889,7 +1889,8 @@ static bool cloth_build_springs(ClothModifierData *clmd, Mesh *mesh)

  /* NOTE: the edges may already exist so run reinsert. */

-  /* insert other near springs in edgeset AFTER bending springs are calculated (for selfcolls) */
+  /* Insert other near springs in `edgeset` AFTER bending springs are calculated
+   * (for self-collision). */
  for (int i = 0; i < numedges; i++) { /* struct springs */
    BLI_edgeset_add(edgeset, edges[i].v1, edges[i].v2);
  }
--- a/source/blender/blenkernel/intern/curves_geometry.cc
+++ b/source/blender/blenkernel/intern/curves_geometry.cc
@ -985,7 +985,6 @@ void CurvesGeometry::tag_normals_changed()
 }
 void CurvesGeometry::tag_radii_changed()
 {
-  this->runtime->bounds_cache.tag_dirty();
 }

 static void translate_positions(MutableSpan<float3> positions, const float3 &translation)
@ -1068,19 +1067,8 @@ bool CurvesGeometry::bounds_min_max(float3 &min, float3 &max) const
    return false;
  }

-  this->runtime->bounds_cache.ensure([&](Bounds<float3> &r_bounds) {
-    const Span<float3> positions = this->evaluated_positions();
-    if (this->attributes().contains("radius")) {
-      const VArraySpan<float> radii = this->attributes().lookup<float>("radius");
-      Array<float> evaluated_radii(this->evaluated_points_num());
-      this->ensure_can_interpolate_to_evaluated();
-      this->interpolate_to_evaluated(radii, evaluated_radii.as_mutable_span());
-      r_bounds = *bounds::min_max_with_radii(positions, evaluated_radii.as_span());
-    }
-    else {
-      r_bounds = *bounds::min_max(positions);
-    }
-  });
+  this->runtime->bounds_cache.ensure(
+      [&](Bounds<float3> &r_bounds) { r_bounds = *bounds::min_max(this->evaluated_positions()); });

  const Bounds<float3> &bounds = this->runtime->bounds_cache.data();
  min = math::min(bounds.min, min);
--- a/source/blender/blenkernel/intern/data_transfer.cc
+++ b/source/blender/blenkernel/intern/data_transfer.cc
@ -251,7 +251,7 @@ int BKE_object_data_transfer_dttype_to_srcdst_index(const int dtdata_type)
 /* ********** */

 /**
- * When transfering color attributes, also transfer the active color attribute string.
+ * When transferring color attributes, also transfer the active color attribute string.
 * If a match can't be found, use the first color layer that can be found (to ensure a valid string
 * is set).
 */
@ -295,7 +295,7 @@ static void data_transfer_mesh_attributes_transfer_active_color_string(
 }

 /**
- * When transfering color attributes, also transfer the default color attribute string.
+ * When transferring color attributes, also transfer the default color attribute string.
 * If a match cant be found, use the first color layer that can be found (to ensure a valid string
 * is set).
 */
@ -1211,8 +1211,8 @@ void BKE_object_data_transfer_layout(struct Depsgraph *depsgraph,
                                           fromlayers,
                                           tolayers,
                                           nullptr);
-      /* Make sure we have active/defaut color layers if none existed before.
-       * Use the active/defaut from src (if it was transferred), otherwise the first. */
+      /* Make sure we have active/default color layers if none existed before.
+       * Use the active/default from src (if it was transferred), otherwise the first. */
      if (ELEM(cddata_type, CD_PROP_COLOR, CD_PROP_BYTE_COLOR)) {
        data_transfer_mesh_attributes_transfer_active_color_string(
            me_dst, me_src, ATTR_DOMAIN_MASK_POINT, cddata_type);
@ -1259,8 +1259,8 @@ void BKE_object_data_transfer_layout(struct Depsgraph *depsgraph,
                                           fromlayers,
                                           tolayers,
                                           nullptr);
-      /* Make sure we have active/defaut color layers if none existed before.
-       * Use the active/defaut from src (if it was transferred), otherwise the first. */
+      /* Make sure we have active/default color layers if none existed before.
+       * Use the active/default from src (if it was transferred), otherwise the first. */
      if (ELEM(cddata_type, CD_PROP_COLOR, CD_PROP_BYTE_COLOR)) {
        data_transfer_mesh_attributes_transfer_active_color_string(
            me_dst, me_src, ATTR_DOMAIN_MASK_CORNER, cddata_type);
--- a/source/blender/blenkernel/intern/geometry_component_mesh.cc
+++ b/source/blender/blenkernel/intern/geometry_component_mesh.cc
@ -211,7 +211,8 @@ void adapt_mesh_domain_corner_to_point_impl(const Mesh &mesh,

  /* Deselect loose vertices without corners that are still selected from the 'true' default. */
  /* The record fact says that the value is true.
-   *Writing to the array from different threads is okay because each thread sets the same value. */
+   * Writing to the array from different threads is okay because each thread sets the same value.
+   */
  threading::parallel_for(loose_verts.index_range(), 2048, [&](const IndexRange range) {
    for (const int vert_index : range) {
      if (loose_verts[vert_index]) {
--- a/source/blender/blenkernel/intern/gpencil_geom.cc
+++ b/source/blender/blenkernel/intern/gpencil_geom.cc
@ -1015,12 +1015,14 @@ bool BKE_gpencil_stroke_smooth_point(bGPDstroke *gps,
    return false;
  }

-  /* Overview of the algorithm here and in the following smooth functions:
-   *  The smooth functions return the new attribute in question for a single point.
-   *  The result is stored in r_gps->points[point_index], while the data is read from gps.
-   *  To get a correct result, duplicate the stroke point data and read from the copy,
-   *  while writing to the real stroke. Not doing that will result in acceptable, but
-   *  asymmetric results.
+  /* - Overview of the algorithm here and in the following smooth functions:
+   *
+   *   The smooth functions return the new attribute in question for a single point.
+   *   The result is stored in r_gps->points[point_index], while the data is read from gps.
+   *   To get a correct result, duplicate the stroke point data and read from the copy,
+   *   while writing to the real stroke. Not doing that will result in acceptable, but
+   *   asymmetric results.
+   *
   * This algorithm works as long as all points are being smoothed. If there is
   * points that should not get smoothed, use the old repeat smooth pattern with
   * the parameter "iterations" set to 1 or 2. (2 matches the old algorithm).
@ -3237,7 +3239,7 @@ bGPDstroke *BKE_gpencil_stroke_delete_tagged_points(bGPdata *gpd,

        pts = new_stroke->points;
        for (j = 0; j < new_stroke->totpoints; j++, pts++) {
-          /* Some points have time = 0, so check to not get negative time values.*/
+          /* Some points have time = 0, so check to not get negative time values. */
          pts->time = max_ff(pts->time - delta, 0.0f);
          /* set flag for select again later */
          if (select == true) {
--- a/source/blender/blenkernel/intern/shrinkwrap.cc
+++ b/source/blender/blenkernel/intern/shrinkwrap.cc
@ -1406,7 +1406,7 @@ void shrinkwrapModifier_deform(ShrinkwrapModifierData *smd,
  calc.aux_target = DEG_get_evaluated_object(ctx->depsgraph, smd->auxTarget);

  if (mesh != nullptr && smd->shrinkType == MOD_SHRINKWRAP_PROJECT) {
-    /* Setup arrays to get vertexs positions, normals and deform weights */
+    /* Setup arrays to get vertex positions, normals and deform weights */
    calc.vert_positions = BKE_mesh_vert_positions_for_write(mesh);
    calc.vert_normals = BKE_mesh_vertex_normals_ensure(mesh);

--- a/source/blender/blenkernel/intern/subdiv_foreach.cc
+++ b/source/blender/blenkernel/intern/subdiv_foreach.cc
@ -167,15 +167,6 @@ static void subdiv_foreach_ctx_count(SubdivForeachTaskContext *ctx)
  for (int poly_index = 0; poly_index < coarse_mesh->totpoly; poly_index++) {
    const MPoly *coarse_poly = &ctx->coarse_polys[poly_index];
    const int num_ptex_faces_per_poly = num_ptex_faces_per_poly_get(coarse_poly);
-    for (int corner = 0; corner < coarse_poly->totloop; corner++) {
-      const MLoop *loop = &ctx->coarse_loops[coarse_poly->loopstart + corner];
-      const bool is_edge_used = BLI_BITMAP_TEST_BOOL(ctx->coarse_edges_used_map, loop->e);
-      /* Edges which aren't counted yet. */
-      if (!is_edge_used) {
-        BLI_BITMAP_ENABLE(ctx->coarse_edges_used_map, loop->e);
-        ctx->num_subdiv_vertices += num_subdiv_vertices_per_coarse_edge;
-      }
-    }
    /* Inner vertices of polygon. */
    if (num_ptex_faces_per_poly == 1) {
      ctx->num_subdiv_vertices += num_inner_vertices_per_quad;
@ -197,12 +188,10 @@ static void subdiv_foreach_ctx_count(SubdivForeachTaskContext *ctx)
                                  num_polys_per_ptex_get(no_quad_patch_resolution);
    }
  }
-  /* Calculate extra vertices created by loose edges. */
-  for (int edge_index = 0; edge_index < coarse_mesh->totedge; edge_index++) {
-    if (!BLI_BITMAP_TEST_BOOL(ctx->coarse_edges_used_map, edge_index)) {
-      ctx->num_subdiv_vertices += num_subdiv_vertices_per_coarse_edge;
-    }
-  }
+
+  /* Add vertices used by outer edges on subdivided faces and loose edges. */
+  ctx->num_subdiv_vertices += num_subdiv_vertices_per_coarse_edge * coarse_mesh->totedge;
+
  ctx->num_subdiv_loops = ctx->num_subdiv_polygons * 4;
 }

@ -270,8 +259,6 @@ static void subdiv_foreach_ctx_init(Subdiv *subdiv, SubdivForeachTaskContext *ct
  subdiv_foreach_ctx_init_offsets(ctx);
  /* Calculate number of geometry in the result subdivision mesh. */
  subdiv_foreach_ctx_count(ctx);
-  /* Re-set maps which were used at this step. */
-  BLI_bitmap_set_all(ctx->coarse_edges_used_map, false, coarse_mesh->totedge);
  ctx->face_ptex_offset = BKE_subdiv_face_ptex_offset_get(subdiv);
 }

--- a/source/blender/blenkernel/intern/subdiv_mesh.cc
+++ b/source/blender/blenkernel/intern/subdiv_mesh.cc
@ -17,6 +17,7 @@
 #include "BLI_bitmap.h"
 #include "BLI_math_vector.h"
 #include "BLI_math_vector_types.hh"
+#include "BLI_task.hh"

 #include "BKE_customdata.h"
 #include "BKE_key.h"
@ -68,6 +69,10 @@ struct SubdivMeshContext {
  int *accumulated_counters;
  bool have_displacement;

+  /* Write optimal display edge tags into a boolean array rather than the final bit vector
+   * to avoid race conditions when setting bits. */
+  blender::Array<bool> subdiv_display_edges;
+
  /* Lazily initialize a map from vertices to connected edges. */
  std::mutex vert_to_edge_map_mutex;
  int *vert_to_edge_buffer;
@ -536,8 +541,7 @@ static bool subdiv_mesh_topology_info(const SubdivForeachContext *foreach_contex
  subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags.clear();
  subdiv_context->subdiv_mesh->runtime->subsurf_face_dot_tags.resize(num_vertices);
  if (subdiv_context->settings->use_optimal_display) {
-    subdiv_context->subdiv_mesh->runtime->subsurf_optimal_display_edges.clear();
-    subdiv_context->subdiv_mesh->runtime->subsurf_optimal_display_edges.resize(num_edges);
+    subdiv_context->subdiv_display_edges = blender::Array<bool>(num_edges, false);
  }
  return true;
 }
@ -799,7 +803,7 @@ static void subdiv_copy_edge_data(SubdivMeshContext *ctx,
  CustomData_copy_data(
      &ctx->coarse_mesh->edata, &ctx->subdiv_mesh->edata, coarse_edge_index, subdiv_edge_index, 1);
  if (ctx->settings->use_optimal_display) {
-    ctx->subdiv_mesh->runtime->subsurf_optimal_display_edges[subdiv_edge_index].set();
+    ctx->subdiv_display_edges[subdiv_edge_index] = true;
  }
 }

@ -1158,6 +1162,7 @@ Mesh *BKE_subdiv_to_mesh(Subdiv *subdiv,
                         const SubdivToMeshSettings *settings,
                         const Mesh *coarse_mesh)
 {
+  using namespace blender;
  BKE_subdiv_stats_begin(&subdiv->stats, SUBDIV_STATS_SUBDIV_TO_MESH);
  /* Make sure evaluator is up to date with possible new topology, and that
   * it is refined for the new positions of coarse vertices. */
@ -1196,6 +1201,20 @@ Mesh *BKE_subdiv_to_mesh(Subdiv *subdiv,
  BKE_subdiv_foreach_subdiv_geometry(subdiv, &foreach_context, settings, coarse_mesh);
  BKE_subdiv_stats_end(&subdiv->stats, SUBDIV_STATS_SUBDIV_TO_MESH_GEOMETRY);
  Mesh *result = subdiv_context.subdiv_mesh;
+
+  /* Move the optimal display edge array to the final bit vector. */
+  if (!subdiv_context.subdiv_display_edges.is_empty()) {
+    const Span<bool> span = subdiv_context.subdiv_display_edges;
+    BitVector<> &bit_vector = result->runtime->subsurf_optimal_display_edges;
+    bit_vector.clear();
+    bit_vector.resize(subdiv_context.subdiv_display_edges.size());
+    threading::parallel_for_aligned(span.index_range(), 4096, 64, [&](const IndexRange range) {
+      for (const int i : range) {
+        bit_vector[i].set(span[i]);
+      }
+    });
+  }
+
  // BKE_mesh_validate(result, true, true);
  BKE_subdiv_stats_end(&subdiv->stats, SUBDIV_STATS_SUBDIV_TO_MESH);
  /* Using normals from the limit surface gives different results than Blender's vertex normal
--- a/source/blender/blenlib/intern/storage.c
+++ b/source/blender/blenlib/intern/storage.c
@ -57,8 +57,8 @@
 #include "BLI_threads.h"
 #include "BLI_utildefines.h"

+/* NOTE: The implementation for Apple lives in storage_apple.mm. */
 #if !defined(__APPLE__)
-/* The implementation for Apple lives in storage_apple.mm.*/
 bool BLI_change_working_dir(const char *dir)
 {
  BLI_assert(BLI_thread_is_main());
--- a/source/blender/blenlib/intern/storage_apple.mm
+++ b/source/blender/blenlib/intern/storage_apple.mm
@ -189,7 +189,7 @@ const char *BLI_expand_tilde(const char *path_with_tilde)

 char *BLI_current_working_dir(char *dir, const size_t maxncpy)
 {
-  /* Can't just copy to the *dir pointer, as [path getCString gets grumpy.*/
+  /* Can't just copy to the *dir pointer, as [path getCString gets grumpy. */
  char path_expanded[PATH_MAX];
  @autoreleasepool {
    NSString *path = [[NSFileManager defaultManager] currentDirectoryPath];
--- a/source/blender/compositor/nodes/COM_CompositorNode.cc
+++ b/source/blender/compositor/nodes/COM_CompositorNode.cc
@ -15,7 +15,8 @@ void CompositorNode::convert_to_operations(NodeConverter &converter,
                                           const CompositorContext &context) const
 {
  const bNode *editor_node = this->get_bnode();
-  bool is_active = (editor_node->flag & NODE_DO_OUTPUT_RECALC) || context.is_rendering();
+  bool is_active = ((editor_node->flag & NODE_DO_OUTPUT_RECALC) || context.is_rendering()) &&
+                   (editor_node->flag & NODE_DO_OUTPUT);
  bool ignore_alpha = (editor_node->custom2 & CMP_NODE_OUTPUT_IGNORE_ALPHA) != 0;

  NodeInput *image_socket = this->get_input_socket(0);
--- a/source/blender/compositor/nodes/COM_SplitViewerNode.cc
+++ b/source/blender/compositor/nodes/COM_SplitViewerNode.cc
@ -17,7 +17,7 @@ void SplitViewerNode::convert_to_operations(NodeConverter &converter,
                                            const CompositorContext &context) const
 {
  const bNode *editor_node = this->get_bnode();
-  bool do_output = (editor_node->flag & NODE_DO_OUTPUT_RECALC || context.is_rendering()) &&
+  bool is_active = (editor_node->flag & NODE_DO_OUTPUT_RECALC || context.is_rendering()) &&
                   (editor_node->flag & NODE_DO_OUTPUT);

  NodeInput *image1Socket = this->get_input_socket(0);
@ -54,7 +54,7 @@ void SplitViewerNode::convert_to_operations(NodeConverter &converter,

  converter.add_preview(split_viewer_operation->get_output_socket());

-  if (do_output) {
+  if (is_active) {
    converter.register_viewer(viewer_operation);
  }
 }
--- a/source/blender/compositor/nodes/COM_ViewerNode.cc
+++ b/source/blender/compositor/nodes/COM_ViewerNode.cc
@ -16,7 +16,7 @@ void ViewerNode::convert_to_operations(NodeConverter &converter,
                                       const CompositorContext &context) const
 {
  const bNode *editor_node = this->get_bnode();
-  bool do_output = (editor_node->flag & NODE_DO_OUTPUT_RECALC || context.is_rendering()) &&
+  bool is_active = (editor_node->flag & NODE_DO_OUTPUT_RECALC || context.is_rendering()) &&
                   (editor_node->flag & NODE_DO_OUTPUT);
  bool ignore_alpha = (editor_node->custom2 & CMP_NODE_OUTPUT_IGNORE_ALPHA) != 0;

@ -61,7 +61,7 @@ void ViewerNode::convert_to_operations(NodeConverter &converter,

  converter.add_node_input_preview(image_socket);

-  if (do_output) {
+  if (is_active) {
    converter.register_viewer(viewer_operation);
  }
 }
--- a/source/blender/compositor/operations/COM_BokehImageOperation.h
+++ b/source/blender/compositor/operations/COM_BokehImageOperation.h
@ -8,8 +8,8 @@
 namespace blender::compositor {

 /**
- * \brief The BokehImageOperation class is an operation that creates an image useful to mimic the
- *internals of a camera.
+ * \brief The #BokehImageOperation class is an operation that creates an image useful to mimic the
+ * internals of a camera.
 *
 * features:
 *  - number of flaps
@ -19,20 +19,20 @@ namespace blender::compositor {
 *  - simulate lens-shift
 *
 * Per pixel the algorithm determines the edge of the bokeh on the same line as the center of the
- *image and the pixel is evaluating.
+ * image and the pixel being evaluated.
 *
 * The edge is detected by finding the closest point on the direct line between the two nearest
- *flap-corners. this edge is interpolated with a full circle. Result of this edge detection is
- *stored as the distance between the center of the image and the edge.
+ * flap-corners. This edge is interpolated with a full circle. Result of this edge detection is
+ * stored as the distance between the center of the image and the edge.
 *
 * catadioptric lenses are simulated to interpolate between the center of the image and the
- *distance of the edge. We now have three distances:
- *  - distance between the center of the image and the pixel to be evaluated
- *  - distance between the center of the image and the outer-edge
- *  - distance between the center of the image and the inner-edge
- *
+ * distance of the edge. We now have three distances:
+ * - Distance between the center of the image and the pixel to be evaluated.
+ * - Distance between the center of the image and the outer-edge.
+ * - Distance between the center of the image and the inner-edge.
+ *
 * With a simple compare it can be detected if the evaluated pixel is between the outer and inner
- *edge.
+ * edge.
 */
 class BokehImageOperation : public MultiThreadedOperation {
 private:
@ -105,7 +105,7 @@ class BokehImageOperation : public MultiThreadedOperation {
  void init_execution() override;

  /**
-   * \brief Deinitialize the execution
+   * \brief De-initialize the execution
   */
  void deinit_execution() override;

--- a/source/blender/draw/engines/eevee/eevee_shaders.cc
+++ b/source/blender/draw/engines/eevee/eevee_shaders.cc
@ -1400,7 +1400,7 @@ struct GPUMaterial *EEVEE_material_get(
    case GPU_MAT_QUEUED: {
      vedata->stl->g_data->queued_shaders_count++;
      GPUMaterial *default_mat = EEVEE_material_default_get(scene, ma, options);
-      /* Mark pending material with its default material for future cache warming.*/
+      /* Mark pending material with its default material for future cache warming. */
      GPU_material_set_default(mat, default_mat);
      /* Return default material. */
      mat = default_mat;
--- a/source/blender/draw/engines/eevee_next/eevee_shadow.cc
+++ b/source/blender/draw/engines/eevee_next/eevee_shadow.cc
@ -440,7 +440,7 @@ void ShadowDirectional::cascade_tilemaps_distribution(Light &light, const Camera

  /* The bias is applied in cascade_level_range().
   * Using clipmap_lod_min here simplify code in shadow_directional_level().
-   * Minus 1 because of the ceil().*/
+   * Minus 1 because of the ceil(). */
  light._clipmap_lod_bias = light.clipmap_lod_min - 1;

  /* Scaling is handled by ShadowCoordinates.lod_relative. */
--- a/source/blender/editors/interface/interface_handlers.cc
+++ b/source/blender/editors/interface/interface_handlers.cc
@ -11303,7 +11303,7 @@ static int ui_region_handler(bContext *C, const wmEvent *event, void * /*userdat
    ui_blocks_set_tooltips(region, true);
  }

-  /* Always do this, to reliably update view and uilist item highlighting, even if
+  /* Always do this, to reliably update view and UI-list item highlighting, even if
   * the mouse hovers a button nested in the item (it's an overlapping layout). */
  ui_handle_viewlist_items_hover(event, region);
  if (retval == WM_UI_HANDLER_CONTINUE) {
--- a/source/blender/editors/interface/interface_icons.cc
+++ b/source/blender/editors/interface/interface_icons.cc
@ -1617,12 +1617,12 @@ static void icon_draw_cache_texture_flush_ex(GPUTexture *texture,
    return;
  }

-  GPUShader *shader = GPU_shader_get_builtin_shader(GPU_SHADER_2D_IMAGE_MULTI_RECT_COLOR);
+  GPUShader *shader = GPU_shader_get_builtin_shader(GPU_SHADER_ICON_MULTI);
  GPU_shader_bind(shader);

-  const int data_binding = GPU_shader_get_ubo_binding(shader, "multi_rect_data");
+  const int data_binding = GPU_shader_get_ubo_binding(shader, "multi_icon_data");
  GPUUniformBuf *ubo = GPU_uniformbuf_create_ex(
-      sizeof(MultiRectCallData), texture_draw_calls->drawcall_cache, __func__);
+      sizeof(MultiIconCallData), texture_draw_calls->drawcall_cache, __func__);
  GPU_uniformbuf_bind(ubo, data_binding);

  const int img_binding = GPU_shader_get_sampler_binding(shader, "image");
@ -1798,7 +1798,7 @@ static void icon_draw_texture(float x,
  GPU_shader_bind(shader);

  const int img_binding = GPU_shader_get_sampler_binding(shader, "image");
-  const int color_loc = GPU_shader_get_builtin_uniform(shader, GPU_UNIFORM_COLOR);
+  const int color_loc = GPU_shader_get_uniform(shader, "finalColor");
  const int rect_tex_loc = GPU_shader_get_uniform(shader, "rect_icon");
  const int rect_geom_loc = GPU_shader_get_uniform(shader, "rect_geom");

--- a/source/blender/editors/interface/interface_templates.cc
+++ b/source/blender/editors/interface/interface_templates.cc
@ -2337,7 +2337,7 @@ void uiTemplateModifiers(uiLayout * /*layout*/, bContext *C)
 /* -------------------------------------------------------------------- */
 /** \name Constraints Template
 *
- *  Template for building the panel layout for the active object or bone's constraints.
+ * Template for building the panel layout for the active object or bone's constraints.
 * \{ */

 /** For building the panel UI for constraints. */
--- a/source/blender/editors/screen/area.cc
+++ b/source/blender/editors/screen/area.cc
@ -77,7 +77,7 @@ static void region_draw_emboss(const ARegion *region, const rcti *scirct, int si
  rect.ymin = scirct->ymin - region->winrct.ymin;
  rect.ymax = scirct->ymax - region->winrct.ymin;

-  /* set transp line */
+  /* Set transparent line. */
  GPU_blend(GPU_BLEND_ALPHA);

  float color[4] = {0.0f, 0.0f, 0.0f, 0.25f};
@ -1286,11 +1286,11 @@ static void region_rect_recursive(
    alignment = RGN_ALIGN_NONE;
  }

-  /* If both the ARegion.sizex/y and the prefsize are 0, the region is tagged as too small, even
-   * before the layout for dynamic regions is created. #wm_draw_window_offscreen() allows the
-   * layout to be created despite the RGN_FLAG_TOO_SMALL flag being set. But there may still be
-   * regions that don't have a separate ARegionType.layout callback. For those, set a default
-   * prefsize so they can become visible. */
+  /* If both the #ARegion.sizex/y and the #ARegionType.prefsizex/y are 0,
+   * the region is tagged as too small, even before the layout for dynamic regions is created.
+   * #wm_draw_window_offscreen() allows the layout to be created despite the #RGN_FLAG_TOO_SMALL
+   * flag being set. But there may still be regions that don't have a separate #ARegionType.layout
+   * callback. For those, set a default #ARegionType.prefsizex/y so they can become visible. */
  if ((region->flag & RGN_FLAG_DYNAMIC_SIZE) && !(region->type->layout)) {
    if ((region->sizex == 0) && (region->type->prefsizex == 0)) {
      region->type->prefsizex = AREAMINX;
@ -1300,7 +1300,7 @@ static void region_rect_recursive(
    }
  }

-  /* prefsize, taking into account DPI */
+  /* `prefsizex/y`, taking into account DPI. */
  int prefsizex = UI_DPI_FAC *
                  ((region->sizex > 1) ? region->sizex + 0.5f : region->type->prefsizex);
  int prefsizey;
@ -1523,7 +1523,7 @@ static void region_rect_recursive(
    region_overlap_fix(area, region);
  }

-  /* set winrect for azones */
+  /* Set `region->winrct` for action-zones. */
  if (region->flag & (RGN_FLAG_HIDDEN | RGN_FLAG_TOO_SMALL)) {
    region->winrct = (region->overlap) ? *overlap_remainder : *remainder;

@ -1587,7 +1587,7 @@ static void region_rect_recursive(

 static void area_calc_totrct(ScrArea *area, const rcti *window_rect)
 {
-  short px = (short)U.pixelsize;
+  short px = short(U.pixelsize);

  area->totrct.xmin = area->v1->vec.x;
  area->totrct.xmax = area->v4->vec.x;
@ -2182,7 +2182,7 @@ static void region_align_info_from_area(ScrArea *area, RegionTypeAlignInfo *r_al

  LISTBASE_FOREACH (ARegion *, region, &area->regionbase) {
    const int index = region->regiontype;
-    if ((uint)index < RGN_TYPE_NUM) {
+    if (uint(index) < RGN_TYPE_NUM) {
      r_align_info->by_type[index].alignment = RGN_ALIGN_ENUM_FROM_MASK(region->alignment);
      r_align_info->by_type[index].hidden = (region->flag & RGN_FLAG_HIDDEN) != 0;
    }
@ -2362,7 +2362,7 @@ static void region_align_info_to_area(
  ARegion *region_by_type[RGN_TYPE_NUM] = {nullptr};
  LISTBASE_FOREACH (ARegion *, region, &area->regionbase) {
    const int index = region->regiontype;
-    if ((uint)index < RGN_TYPE_NUM) {
+    if (uint(index) < RGN_TYPE_NUM) {
      region_by_type[index] = region;
    }
  }
--- a/source/blender/editors/sculpt_paint/sculpt.cc
+++ b/source/blender/editors/sculpt_paint/sculpt.cc
@ -5526,7 +5526,8 @@ void SCULPT_flush_update_step(bContext *C, SculptUpdateType update_flags)
      SCULPT_update_object_bounding_box(ob);
    }

-    if (SCULPT_get_redraw_rect(region, CTX_wm_region_view3d(C), ob, &r)) {
+    RegionView3D *rv3d = CTX_wm_region_view3d(C);
+    if (rv3d && SCULPT_get_redraw_rect(region, rv3d, ob, &r)) {
      if (ss->cache) {
        ss->cache->current_r = r;
      }
--- a/source/blender/editors/sculpt_paint/sculpt_filter_color.cc
+++ b/source/blender/editors/sculpt_paint/sculpt_filter_color.cc
@ -12,6 +12,7 @@
 #include "BLI_task.h"

 #include "DNA_meshdata_types.h"
+#include "DNA_userdef_types.h"

 #include "BKE_context.h"
 #include "BKE_paint.h"
@ -30,6 +31,9 @@
 #include "RNA_access.h"
 #include "RNA_define.h"

+#include "UI_interface.h"
+#include "UI_resources.h"
+
 #include <cmath>
 #include <cstdlib>

@ -258,29 +262,15 @@ static void sculpt_color_presmooth_init(SculptSession *ss)
  }
 }

-static int sculpt_color_filter_modal(bContext *C, wmOperator *op, const wmEvent *event)
+static void sculpt_color_filter_apply(bContext *C, wmOperator *op, Object *ob)
 {
-  Object *ob = CTX_data_active_object(C);
-  SculptSession *ss = ob->sculpt;
  Sculpt *sd = CTX_data_tool_settings(C)->sculpt;
+  SculptSession *ss = ob->sculpt;
+
  const int mode = RNA_enum_get(op->ptr, "type");
  float filter_strength = RNA_float_get(op->ptr, "strength");
-
-  if (event->type == LEFTMOUSE && event->val == KM_RELEASE) {
-    SCULPT_undo_push_end(ob);
-    SCULPT_filter_cache_free(ss);
-    SCULPT_flush_update_done(C, ob, SCULPT_UPDATE_COLOR);
-    return OPERATOR_FINISHED;
-  }
-
-  if (event->type != MOUSEMOVE) {
-    return OPERATOR_RUNNING_MODAL;
-  }
-
-  const float len = event->prev_press_xy[0] - event->xy[0];
-  filter_strength = filter_strength * -len * 0.001f;
-
  float fill_color[3];
+
  RNA_float_get_array(op->ptr, "fill_color", fill_color);
  IMB_colormanagement_srgb_to_scene_linear_v3(fill_color, fill_color);

@ -303,31 +293,63 @@ static int sculpt_color_filter_modal(bContext *C, wmOperator *op, const wmEvent
  BLI_task_parallel_range(0, ss->filter_cache->totnode, &data, color_filter_task_cb, &settings);

  SCULPT_flush_update_step(C, SCULPT_UPDATE_COLOR);
+}
+
+static void sculpt_color_filter_end(bContext *C, Object *ob)
+{
+  SculptSession *ss = ob->sculpt;
+
+  SCULPT_undo_push_end(ob);
+  SCULPT_filter_cache_free(ss);
+  SCULPT_flush_update_done(C, ob, SCULPT_UPDATE_COLOR);
+}
+
+static int sculpt_color_filter_modal(bContext *C, wmOperator *op, const wmEvent *event)
+{
+  Object *ob = CTX_data_active_object(C);
+  SculptSession *ss = ob->sculpt;
+
+  if (event->type == LEFTMOUSE && event->val == KM_RELEASE) {
+    sculpt_color_filter_end(C, ob);
+    return OPERATOR_FINISHED;
+  }
+
+  if (event->type != MOUSEMOVE) {
+    return OPERATOR_RUNNING_MODAL;
+  }
+
+  const float len = (event->prev_press_xy[0] - event->xy[0]) * 0.001f;
+  float filter_strength = ss->filter_cache->start_filter_strength * -len;
+  RNA_float_set(op->ptr, "strength", filter_strength);
+
+  sculpt_color_filter_apply(C, op, ob);

  return OPERATOR_RUNNING_MODAL;
 }

-static int sculpt_color_filter_invoke(bContext *C, wmOperator *op, const wmEvent *event)
+static int sculpt_color_filter_init(bContext *C, wmOperator *op)
 {
  Object *ob = CTX_data_active_object(C);
  Sculpt *sd = CTX_data_tool_settings(C)->sculpt;
-  View3D *v3d = CTX_wm_view3d(C);
  SculptSession *ss = ob->sculpt;
  PBVH *pbvh = ob->sculpt->pbvh;
-  if (v3d->shading.type == OB_SOLID) {
-    v3d->shading.color_type = V3D_SHADING_VERTEX_COLOR;
-  }
+  View3D *v3d = CTX_wm_view3d(C);
+
+  int mval[2];
+  RNA_int_get_array(op->ptr, "start_mouse", mval);

  const bool use_automasking = SCULPT_is_automasking_enabled(sd, ss, nullptr);
  if (use_automasking) {
    /* Increment stroke id for auto-masking system. */
    SCULPT_stroke_id_next(ob);

-    /* Update the active face set manually as the paint cursor is not enabled when using the Mesh
-     * Filter Tool. */
-    float mval_fl[2] = {float(event->mval[0]), float(event->mval[1])};
-    SculptCursorGeometryInfo sgi;
-    SCULPT_cursor_geometry_info_update(C, &sgi, mval_fl, false);
+    if (v3d) {
+      /* Update the active face set manually as the paint cursor is not enabled when using the Mesh
+       * Filter Tool. */
+      float mval_fl[2] = {float(mval[0]), float(mval[1])};
+      SculptCursorGeometryInfo sgi;
+      SCULPT_cursor_geometry_info_update(C, &sgi, mval_fl, false);
+    }
  }

  /* Disable for multires and dyntopo for now */
@ -351,18 +373,78 @@ static int sculpt_color_filter_invoke(bContext *C, wmOperator *op, const wmEvent
                           ob,
                           sd,
                           SCULPT_UNDO_COLOR,
-                           event->mval,
+                           mval,
                           RNA_float_get(op->ptr, "area_normal_radius"),
                           RNA_float_get(op->ptr, "strength"));
  FilterCache *filter_cache = ss->filter_cache;
  filter_cache->active_face_set = SCULPT_FACE_SET_NONE;
  filter_cache->automasking = SCULPT_automasking_cache_init(sd, nullptr, ob);
+
+  return OPERATOR_PASS_THROUGH;
+}
+
+static int sculpt_color_filter_exec(bContext *C, wmOperator *op)
+{
+  Object *ob = CTX_data_active_object(C);
+
+  if (sculpt_color_filter_init(C, op) == OPERATOR_CANCELLED) {
+    return OPERATOR_CANCELLED;
+  }
+
+  sculpt_color_filter_apply(C, op, ob);
+  sculpt_color_filter_end(C, ob);
+
+  return OPERATOR_FINISHED;
+}
+
+static int sculpt_color_filter_invoke(bContext *C, wmOperator *op, const wmEvent *event)
+{
+  Object *ob = CTX_data_active_object(C);
+  View3D *v3d = CTX_wm_view3d(C);
+  if (v3d && v3d->shading.type == OB_SOLID) {
+    v3d->shading.color_type = V3D_SHADING_VERTEX_COLOR;
+  }
+
+  RNA_int_set_array(op->ptr, "start_mouse", event->mval);
+
+  if (sculpt_color_filter_init(C, op) == OPERATOR_CANCELLED) {
+    return OPERATOR_CANCELLED;
+  }
+
  ED_paint_tool_update_sticky_shading_color(C, ob);

  WM_event_add_modal_handler(C, op);
  return OPERATOR_RUNNING_MODAL;
 }

+static const char *sculpt_color_filter_get_name(wmOperatorType * /*ot*/, PointerRNA *ptr)
+{
+  int mode = RNA_enum_get(ptr, "type");
+  EnumPropertyItem *item = prop_color_filter_types;
+
+  while (item->identifier) {
+    if (item->value == mode) {
+      return item->name;
+    }
+
+    item++;
+  }
+
+  BLI_assert_unreachable();
+  return "error";
+}
+
+static void sculpt_color_filter_ui(bContext * /*C*/, wmOperator *op)
+{
+  uiLayout *layout = op->layout;
+
+  uiItemR(layout, op->ptr, "strength", 0, nullptr, ICON_NONE);
+
+  if (RNA_enum_get(op->ptr, "type") == COLOR_FILTER_FILL) {
+    uiItemR(layout, op->ptr, "fill_color", 0, nullptr, ICON_NONE);
+  }
+}
+
 void SCULPT_OT_color_filter(wmOperatorType *ot)
 {
  /* identifiers */
@ -372,8 +454,11 @@ void SCULPT_OT_color_filter(wmOperatorType *ot)

  /* api callbacks */
  ot->invoke = sculpt_color_filter_invoke;
+  ot->exec = sculpt_color_filter_exec;
  ot->modal = sculpt_color_filter_modal;
  ot->poll = SCULPT_mode_poll;
+  ot->ui = sculpt_color_filter_ui;
+  ot->get_name = sculpt_color_filter_get_name;

  ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO;

--- a/source/blender/editors/sculpt_paint/sculpt_filter_mesh.cc
+++ b/source/blender/editors/sculpt_paint/sculpt_filter_mesh.cc
@ -162,8 +162,10 @@ void SCULPT_filter_cache_init(bContext *C,
  ED_view3d_viewcontext_init(C, &vc, depsgraph);

  ss->filter_cache->vc = vc;
-  copy_m4_m4(ss->filter_cache->viewmat, vc.rv3d->viewmat);
-  copy_m4_m4(ss->filter_cache->viewmat_inv, vc.rv3d->viewinv);
+  if (vc.rv3d) {
+    copy_m4_m4(ss->filter_cache->viewmat, vc.rv3d->viewmat);
+    copy_m4_m4(ss->filter_cache->viewmat_inv, vc.rv3d->viewinv);
+  }

  Scene *scene = CTX_data_scene(C);
  UnifiedPaintSettings *ups = &scene->toolsettings->unified_paint_settings;
@ -171,7 +173,7 @@ void SCULPT_filter_cache_init(bContext *C,
  float co[3];
  float mval_fl[2] = {float(mval[0]), float(mval[1])};

-  if (SCULPT_stroke_get_location(C, co, mval_fl, false)) {
+  if (vc.rv3d && SCULPT_stroke_get_location(C, co, mval_fl, false)) {
    PBVHNode **nodes;
    int totnode;

@ -229,14 +231,16 @@ void SCULPT_filter_cache_init(bContext *C,
  float mat[3][3];
  float viewDir[3] = {0.0f, 0.0f, 1.0f};

-  ED_view3d_ob_project_mat_get(vc.rv3d, ob, projection_mat);
+  if (vc.rv3d) {
+    ED_view3d_ob_project_mat_get(vc.rv3d, ob, projection_mat);

-  invert_m4_m4(ob->world_to_object, ob->object_to_world);
-  copy_m3_m4(mat, vc.rv3d->viewinv);
-  mul_m3_v3(mat, viewDir);
-  copy_m3_m4(mat, ob->world_to_object);
-  mul_m3_v3(mat, viewDir);
-  normalize_v3_v3(ss->filter_cache->view_normal, viewDir);
+    invert_m4_m4(ob->world_to_object, ob->object_to_world);
+    copy_m3_m4(mat, vc.rv3d->viewinv);
+    mul_m3_v3(mat, viewDir);
+    copy_m3_m4(mat, ob->world_to_object);
+    mul_m3_v3(mat, viewDir);
+    normalize_v3_v3(ss->filter_cache->view_normal, viewDir);
+  }
 }

 void SCULPT_filter_cache_free(SculptSession *ss)
@ -796,7 +800,7 @@ static void sculpt_mesh_filter_end(bContext *C, wmOperator * /*op*/)
  SCULPT_flush_update_done(C, ob, SCULPT_UPDATE_COORDS);
 }

-static void sculpt_mesh_filter_cancel(bContext *C, wmOperator *op)
+static void UNUSED_FUNCTION(sculpt_mesh_filter_cancel)(bContext *C, wmOperator * /*op*/)
 {
  Object *ob = CTX_data_active_object(C);
  SculptSession *ss = ob->sculpt;
--- a/source/blender/editors/sculpt_paint/sculpt_intern.hh
+++ b/source/blender/editors/sculpt_paint/sculpt_intern.hh
@ -1896,12 +1896,13 @@ void SCULPT_ensure_valid_pivot(const Object *ob, Scene *scene);
 /* Ensures vertex island keys exist and are valid. */
 void SCULPT_topology_islands_ensure(Object *ob);

-/* Mark vertex island keys as invalid.  Call when adding or hiding
- * geometry.
+/**
+ * Mark vertex island keys as invalid.
+ * Call when adding or hiding geometry.
 */
 void SCULPT_topology_islands_invalidate(SculptSession *ss);

-/* Get vertex island key.*/
+/** Get vertex island key. */
 int SCULPT_vertex_island_get(SculptSession *ss, PBVHVertRef vertex);

 /** \} */
--- a/source/blender/editors/space_graph/graph_draw.c
+++ b/source/blender/editors/space_graph/graph_draw.c
@ -593,7 +593,7 @@ static void draw_fcurve_curve(bAnimContext *ac,
   *
   * If the automatically determined sampling frequency is likely to cause an infinite
   * loop (i.e. too close to 0), then clamp it to a determined "safe" value. The value
-   *  chosen here is just the coarsest value which still looks reasonable...
+   * chosen here is just the coarsest value which still looks reasonable.
   */

  /* TODO: perhaps we should have 1.0 frames
--- a/source/blender/editors/space_graph/graph_select.c
+++ b/source/blender/editors/space_graph/graph_select.c
@ -107,10 +107,8 @@ static void nearest_fcurve_vert_store(ListBase *matches,
    int screen_co[2], dist;

    /* convert from data-space to screen coordinates
-     * NOTE: hpoint+1 gives us 0,1,2 respectively for each handle,
-     *  needed to access the relevant vertex coordinates in the 3x3
-     *  'vec' matrix
-     */
+     * NOTE: `hpoint + 1` gives us 0,1,2 respectively for each handle,
+     * needed to access the relevant vertex coordinates in the 3x3 'vec' matrix */
    if (UI_view2d_view_to_region_clip(v2d,
                                      bezt->vec[hpoint + 1][0],
                                      (bezt->vec[hpoint + 1][1] + offset) * unit_scale,
--- a/source/blender/editors/space_outliner/outliner_tree.cc
+++ b/source/blender/editors/space_outliner/outliner_tree.cc
@ -1438,6 +1438,11 @@ static bool outliner_element_visible_get(const Scene *scene,
            return false;
          }
          break;
+        case OB_GPENCIL:
+          if (exclude_filter & SO_FILTER_NO_OB_GPENCIL) {
+            return false;
+          }
+          break;
        default:
          if (exclude_filter & SO_FILTER_NO_OB_OTHERS) {
            return false;
--- a/source/blender/geometry/GEO_mesh_merge_by_distance.hh
+++ b/source/blender/geometry/GEO_mesh_merge_by_distance.hh
@ -49,7 +49,7 @@ std::optional<Mesh *> mesh_merge_by_distance_connected(const Mesh &mesh,
 * \warning \a vert_merge_map must **not** contain any chained mapping (v1 -> v2 -> v3 etc.),
 * this is not supported and will likely generate corrupted geometry.
 *
- * \param vert_dest_map_len: The number of non '-1' values in vtargetmap. (not the size)
+ * \param vert_dest_map_len: The number of non '-1' values in `vert_dest_map`. (not the size)
 */
 Mesh *mesh_merge_verts(const Mesh &mesh, MutableSpan<int> vert_dest_map, int vert_dest_map_len);

--- a/source/blender/geometry/intern/uv_parametrizer.cc
+++ b/source/blender/geometry/intern/uv_parametrizer.cc
@ -3724,13 +3724,14 @@ using GeoUVPinIndex = struct GeoUVPinIndex {
  ParamKey reindex;
 };

-/* Find a (mostly) unique ParamKey given a BMVert index and UV co-ordinates.
- * For each unique pinned UVs, return a unique ParamKey, starting with
- *  a very large number, and decreasing steadily from there.
- * For non-pinned UVs which share a BMVert with a pinned UV,
- *  return the index corresponding to the closest pinned UV.
- * For everything else, just return the BMVert index.
- * Note that ParamKeys will eventually be hashed, so they don't need to be contiguous.
+/**
+ * Find a (mostly) unique #ParamKey given a #BMVert index and UV co-ordinates.
+ * For each unique pinned UV, return a unique #ParamKey, starting with
+ * a very large number, and decreasing steadily from there.
+ * For non-pinned UVs which share a #BMVert with a pinned UV,
+ * return the index corresponding to the closest pinned UV.
+ * For everything else, just return the #BMVert index.
+ * Note that #ParamKeys will eventually be hashed, so they don't need to be contiguous.
 */
 ParamKey GEO_uv_find_pin_index(ParamHandle *handle, const int bmvertindex, const float uv[2])
 {
--- a/source/blender/gpu/CMakeLists.txt
+++ b/source/blender/gpu/CMakeLists.txt
@ -364,7 +364,7 @@ set(GLSL_SRC
  shaders/gpu_shader_2D_line_dashed_frag.glsl
  shaders/gpu_shader_2D_image_vert.glsl
  shaders/gpu_shader_2D_image_rect_vert.glsl
-  shaders/gpu_shader_2D_image_multi_rect_vert.glsl
+  shaders/gpu_shader_icon_multi_vert.glsl
  shaders/gpu_shader_icon_frag.glsl
  shaders/gpu_shader_icon_vert.glsl
  shaders/gpu_shader_image_frag.glsl
@ -373,7 +373,6 @@ set(GLSL_SRC
  shaders/gpu_shader_image_overlays_stereo_merge_frag.glsl
  shaders/gpu_shader_image_shuffle_color_frag.glsl
  shaders/gpu_shader_image_color_frag.glsl
-  shaders/gpu_shader_image_varying_color_frag.glsl
  shaders/gpu_shader_3D_image_vert.glsl
  shaders/gpu_shader_3D_vert.glsl
  shaders/gpu_shader_3D_normal_vert.glsl
@ -511,6 +510,7 @@ set(GLSL_SRC
  shaders/material/gpu_shader_material_world_normals.glsl

  shaders/gpu_shader_gpencil_stroke_vert.glsl
+  shaders/gpu_shader_gpencil_stroke_vert_no_geom.glsl
  shaders/gpu_shader_gpencil_stroke_frag.glsl
  shaders/gpu_shader_gpencil_stroke_geom.glsl

@ -656,7 +656,6 @@ set(SRC_SHADER_CREATE_INFOS
  shaders/infos/gpu_shader_2D_diag_stripes_info.hh
  shaders/infos/gpu_shader_2D_image_desaturate_color_info.hh
  shaders/infos/gpu_shader_2D_image_info.hh
-  shaders/infos/gpu_shader_2D_image_multi_rect_color_info.hh
  shaders/infos/gpu_shader_2D_image_overlays_merge_info.hh
  shaders/infos/gpu_shader_2D_image_overlays_stereo_merge_info.hh
  shaders/infos/gpu_shader_2D_image_rect_color_info.hh
--- a/source/blender/gpu/GPU_material.h
+++ b/source/blender/gpu/GPU_material.h
@ -254,9 +254,9 @@ void GPU_materials_free(struct Main *bmain);

 struct Scene *GPU_material_scene(GPUMaterial *material);
 struct GPUPass *GPU_material_get_pass(GPUMaterial *material);
-/* Return the most optimal shader configuration for the given material .*/
+/** Return the most optimal shader configuration for the given material. */
 struct GPUShader *GPU_material_get_shader(GPUMaterial *material);
-/* Return the base un-optimized shader. */
+/** Return the base un-optimized shader. */
 struct GPUShader *GPU_material_get_shader_base(GPUMaterial *material);
 const char *GPU_material_get_name(GPUMaterial *material);

--- a/source/blender/gpu/GPU_shader_builtin.h
+++ b/source/blender/gpu/GPU_shader_builtin.h
@ -39,7 +39,7 @@ typedef enum eGPUBuiltinShader {
  /** Draw a texture with a desaturation factor. */
  GPU_SHADER_2D_IMAGE_DESATURATE_COLOR,
  /** Draw a group of texture rectangle with an associated color multiplied. */
-  GPU_SHADER_2D_IMAGE_MULTI_RECT_COLOR,
+  GPU_SHADER_ICON_MULTI,
  /** Draw a two color checker based on screen position (not UV coordinates). */
  GPU_SHADER_2D_CHECKER,
  /** Draw diagonal stripes with two alternating colors. */
--- a/source/blender/gpu/GPU_shader_shared.h
+++ b/source/blender/gpu/GPU_shader_shared.h
@ -82,39 +82,39 @@ BLI_STATIC_ASSERT_ALIGN(struct SimpleLightingData, 16)

 #define MAX_CALLS 16

-struct MultiRectCallData {
+struct MultiIconCallData {
  float4 calls_data[MAX_CALLS * 3];
 };
-BLI_STATIC_ASSERT_ALIGN(struct MultiRectCallData, 16)
+BLI_STATIC_ASSERT_ALIGN(struct MultiIconCallData, 16)

 enum TestStatus {
-  TEST_STATUS_NONE = 0,
-  TEST_STATUS_PASSED = 1,
-  TEST_STATUS_FAILED = 2,
+  TEST_STATUS_NONE = 0u,
+  TEST_STATUS_PASSED = 1u,
+  TEST_STATUS_FAILED = 2u,
 };
 enum TestType {
-  TEST_TYPE_BOOL = 0,
-  TEST_TYPE_UINT = 1,
-  TEST_TYPE_INT = 2,
-  TEST_TYPE_FLOAT = 3,
-  TEST_TYPE_IVEC2 = 4,
-  TEST_TYPE_IVEC3 = 5,
-  TEST_TYPE_IVEC4 = 6,
-  TEST_TYPE_UVEC2 = 7,
-  TEST_TYPE_UVEC3 = 8,
-  TEST_TYPE_UVEC4 = 9,
-  TEST_TYPE_VEC2 = 10,
-  TEST_TYPE_VEC3 = 11,
-  TEST_TYPE_VEC4 = 12,
-  TEST_TYPE_MAT2X2 = 13,
-  TEST_TYPE_MAT2X3 = 14,
-  TEST_TYPE_MAT2X4 = 15,
-  TEST_TYPE_MAT3X2 = 16,
-  TEST_TYPE_MAT3X3 = 17,
-  TEST_TYPE_MAT3X4 = 18,
-  TEST_TYPE_MAT4X2 = 19,
-  TEST_TYPE_MAT4X3 = 20,
-  TEST_TYPE_MAT4X4 = 21,
+  TEST_TYPE_BOOL = 0u,
+  TEST_TYPE_UINT = 1u,
+  TEST_TYPE_INT = 2u,
+  TEST_TYPE_FLOAT = 3u,
+  TEST_TYPE_IVEC2 = 4u,
+  TEST_TYPE_IVEC3 = 5u,
+  TEST_TYPE_IVEC4 = 6u,
+  TEST_TYPE_UVEC2 = 7u,
+  TEST_TYPE_UVEC3 = 8u,
+  TEST_TYPE_UVEC4 = 9u,
+  TEST_TYPE_VEC2 = 10u,
+  TEST_TYPE_VEC3 = 11u,
+  TEST_TYPE_VEC4 = 12u,
+  TEST_TYPE_MAT2X2 = 13u,
+  TEST_TYPE_MAT2X3 = 14u,
+  TEST_TYPE_MAT2X4 = 15u,
+  TEST_TYPE_MAT3X2 = 16u,
+  TEST_TYPE_MAT3X3 = 17u,
+  TEST_TYPE_MAT3X4 = 18u,
+  TEST_TYPE_MAT4X2 = 19u,
+  TEST_TYPE_MAT4X3 = 20u,
+  TEST_TYPE_MAT4X4 = 21u,
 };

 /** \note Contains arrays of scalar. To be use only with SSBOs to avoid padding issues. */
--- a/source/blender/gpu/GPU_texture.h
+++ b/source/blender/gpu/GPU_texture.h
@ -47,7 +47,7 @@ typedef enum eGPUSamplerState {
  GPU_SAMPLER_DEFAULT = 0,
  /**
   * Enables hardware linear filtering.
-   * Enables linear interpolation between mips if GPU_SAMPLER_MIPMAP is also set.
+   * Enables linear interpolation between MIPS if GPU_SAMPLER_MIPMAP is also set.
   */
  GPU_SAMPLER_FILTER = (1 << 0),
  /**
@ -85,7 +85,7 @@ typedef enum eGPUSamplerState {
  /** Enable mirror repeat extension mode for directions using the `GPU_SAMPLER_REPEAT_*` flag. */
  GPU_SAMPLER_MIRROR_REPEAT = (1 << 8),

-  /** Special icon sampler with custom lod bias and interpolation mode. */
+  /** Special icon sampler with custom LOD bias and interpolation mode. */
  GPU_SAMPLER_ICON = (1 << 9),
 } eGPUSamplerState;

@ -487,7 +487,7 @@ void GPU_texture_update_mipmap_chain(GPUTexture *texture);
 /**
 * Read the content of a \a mip_level from a \a tex and returns a copy of its data.
 * \warning the texture must have been created using GPU_TEXTURE_USAGE_HOST_READ.
- * \note synchronisation of shader writes via `imageStore()` needs to be explicitly done using
+ * \note synchronization of shader writes via `imageStore()` needs to be explicitly done using
 * `GPU_memory_barrier(GPU_BARRIER_TEXTURE_FETCH)`.
 */
 void *GPU_texture_read(GPUTexture *texture, eGPUDataFormat data_format, int mip_level);
--- a/source/blender/gpu/intern/gpu_codegen.cc
+++ b/source/blender/gpu/intern/gpu_codegen.cc
@ -306,7 +306,7 @@ class GPUCodegen {
  bool should_optimize_heuristic() const
  {
    /* If each of the maximal attributes are exceeded, we can optimize, but we should also ensure
-     * the baseline is met.*/
+     * the baseline is met. */
    bool do_optimize = (nodes_total_ >= 60 || textures_total_ >= 4 || uniforms_total_ >= 64) &&
                       (textures_total_ >= 1 && uniforms_total_ >= 8 && nodes_total_ >= 4);
    return do_optimize;
--- a/source/blender/gpu/intern/gpu_shader_builtin.cc
+++ b/source/blender/gpu/intern/gpu_shader_builtin.cc
@ -42,8 +42,8 @@ static const char *builtin_shader_create_info_name(eGPUBuiltinShader shader)
      return "gpu_shader_2D_image_shuffle_color";
    case GPU_SHADER_2D_IMAGE_RECT_COLOR:
      return "gpu_shader_2D_image_rect_color";
-    case GPU_SHADER_2D_IMAGE_MULTI_RECT_COLOR:
-      return "gpu_shader_2D_image_multi_rect_color";
+    case GPU_SHADER_ICON_MULTI:
+      return "gpu_shader_icon_multi";
    case GPU_SHADER_3D_UNIFORM_COLOR:
      return "gpu_shader_3D_uniform_color";
    case GPU_SHADER_3D_FLAT_COLOR:
@ -85,8 +85,7 @@ static const char *builtin_shader_create_info_name(eGPUBuiltinShader shader)
    case GPU_SHADER_2D_NODELINK_INST:
      return "gpu_shader_2D_nodelink_inst";
    case GPU_SHADER_GPENCIL_STROKE:
-      return GPU_geometry_shader_support() ? "gpu_shader_gpencil_stroke_geom" :
-                                             "gpu_shader_gpencil_stroke_nogeom";
+      return "gpu_shader_gpencil_stroke";
    default:
      BLI_assert_unreachable();
      return "";
--- a/source/blender/gpu/intern/gpu_shader_create_info.cc
+++ b/source/blender/gpu/intern/gpu_shader_create_info.cc
@ -379,6 +379,9 @@ void gpu_shader_create_info_init()

    /* EEVEE Volumetric Material */
    eevee_legacy_material_volumetric_vert = eevee_legacy_material_volumetric_vert_no_geom;
+
+    /* GPencil stroke. */
+    gpu_shader_gpencil_stroke = gpu_shader_gpencil_stroke_no_geom;
  }
 #endif

--- a/source/blender/gpu/intern/gpu_shader_create_info.hh
+++ b/source/blender/gpu/intern/gpu_shader_create_info.hh
@ -291,8 +291,10 @@ struct StageInterfaceInfo {
  };

  StringRefNull name;
-  /** Name of the instance of the block (used to access).
-   *  Can be empty string (i.e: "") only if not using geometry shader. */
+  /**
+   * Name of the instance of the block (used to access).
+   * Can be empty string (i.e: "") only if not using geometry shader.
+   */
  StringRefNull instance_name;
  /** List of all members of the interface. */
  Vector<InOut> inouts;
--- a/source/blender/gpu/intern/gpu_viewport.c
+++ b/source/blender/gpu/intern/gpu_viewport.c
@ -155,7 +155,7 @@ static void gpu_viewport_textures_create(GPUViewport *viewport)

  /* Can be shared with GPUOffscreen. */
  if (viewport->depth_tx == NULL) {
-    /* Depth texture can be read back by gizmos #view3d_depths_create .*/
+    /* Depth texture can be read back by gizmos #view3d_depths_create. */
    viewport->depth_tx = GPU_texture_create_2d("dtxl_depth",
                                               UNPACK2(size),
                                               1,
--- a/source/blender/gpu/metal/mtl_context.hh
+++ b/source/blender/gpu/metal/mtl_context.hh
@ -768,6 +768,7 @@ class MTLContext : public Context {
  void texture_unbind_all();
  id<MTLSamplerState> get_sampler_from_state(MTLSamplerState state);
  id<MTLSamplerState> generate_sampler_from_state(MTLSamplerState state);
+  id<MTLSamplerState> generate_icon_sampler();
  id<MTLSamplerState> get_default_sampler_state();

  /* Metal Context pipeline state. */
--- a/source/blender/gpu/metal/mtl_context.mm
+++ b/source/blender/gpu/metal/mtl_context.mm
@ -223,11 +223,13 @@ MTLContext::MTLContext(void *ghost_window, void *ghost_context)
  }

  /* Initialize samplers. */
-  for (uint i = 0; i < GPU_SAMPLER_MAX; i++) {
+  for (uint i = 0; i < GPU_SAMPLER_ICON; i++) {
    MTLSamplerState state;
    state.state = static_cast<eGPUSamplerState>(i);
    sampler_state_cache_[i] = this->generate_sampler_from_state(state);
  }
+  /* Special sampler for icons. */
+  sampler_state_cache_[GPU_SAMPLER_ICON] = this->generate_icon_sampler();
 }

 MTLContext::~MTLContext()
@ -1525,7 +1527,7 @@ void MTLContext::ensure_texture_bindings(
    int compute_arg_buffer_bind_index = -1;

    /* Argument buffers are used for samplers, when the limit of 16 is exceeded.
-     * NOTE: Compute uses vertex argument for arg buffer bind index.*/
+     * NOTE: Compute uses vertex argument for arg buffer bind index. */
    bool use_argument_buffer_for_samplers = shader_interface->uses_argument_buffer_for_samplers();
    compute_arg_buffer_bind_index = shader_interface->get_argument_buffer_bind_index(
        ShaderStage::COMPUTE);
@ -2025,7 +2027,6 @@ id<MTLSamplerState> MTLContext::get_sampler_from_state(MTLSamplerState sampler_s

 id<MTLSamplerState> MTLContext::generate_sampler_from_state(MTLSamplerState sampler_state)
 {
-  /* Check if sampler already exists for given state. */
  MTLSamplerDescriptor *descriptor = [[MTLSamplerDescriptor alloc] init];
  descriptor.normalizedCoordinates = true;

@ -2068,6 +2069,21 @@ id<MTLSamplerState> MTLContext::generate_sampler_from_state(MTLSamplerState samp
  return state;
 }

+id<MTLSamplerState> MTLContext::generate_icon_sampler()
+{
+  MTLSamplerDescriptor *descriptor = [[MTLSamplerDescriptor alloc] init];
+  descriptor.minFilter = MTLSamplerMinMagFilterLinear;
+  descriptor.magFilter = MTLSamplerMinMagFilterLinear;
+  descriptor.mipFilter = MTLSamplerMipFilterNearest; /* No blending between mip levels. */
+  descriptor.lodMinClamp = 0;
+  descriptor.lodMaxClamp = 1; /* Together with `lodMinClamp`: LOD restricted to [0, 1]. */
+
+  id<MTLSamplerState> icon_state = [this->device newSamplerStateWithDescriptor:descriptor];
+  BLI_assert(icon_state != nil);
+  [descriptor autorelease]; /* Balances the `alloc` above. */
+  return icon_state;
+}
+
 id<MTLSamplerState> MTLContext::get_default_sampler_state()
 {
  if (default_sampler_state_ == nil) {
--- a/source/blender/gpu/metal/mtl_immediate.mm
+++ b/source/blender/gpu/metal/mtl_immediate.mm
@ -239,6 +239,17 @@ void MTLImmediate::end()
    desc.vertex_descriptor.buffer_layouts[0].stride = this->vertex_format.stride;
    BLI_assert(this->vertex_format.stride > 0);

+    /* Emulate LineLoop using LineStrip. */
+    if (this->prim_type == GPU_PRIM_LINE_LOOP) {
+      /* Patch final vertex of line loop to close. Rendered using LineStrip.
+       * NOTE: vertex_len represents original length, however, allocated Metal
+       * buffer contains space for one extra vertex when LineLoop is used. */
+      uchar *buffer_data = reinterpret_cast<uchar *>(current_allocation_.data);
+      memcpy(buffer_data + (vertex_len)*vertex_format.stride, buffer_data, vertex_format.stride);
+      this->vertex_idx++;
+      this->prim_type = GPU_PRIM_LINE_STRIP;
+    }
+
    /* SSBO Vertex Fetch -- Verify Attributes. */
    if (active_mtl_shader->get_uses_ssbo_vertex_fetch()) {
      active_mtl_shader->ssbo_vertex_fetch_bind_attributes_end(rec);
@ -337,16 +348,6 @@ void MTLImmediate::end()
            }
            rendered = true;
          } break;
-          case GPU_PRIM_LINE_LOOP: {
-            /* Patch final vertex of line loop to close. Rendered using LineStrip.
-             * NOTE: vertex_len represents original length, however, allocated Metal
-             * buffer contains space for one extra vertex when LineLoop is used. */
-            uchar *buffer_data = reinterpret_cast<uchar *>(current_allocation_.data);
-            memcpy(buffer_data + (vertex_len)*vertex_format.stride,
-                   buffer_data,
-                   vertex_format.stride);
-            this->vertex_idx++;
-          } break;
          default: {
            BLI_assert_unreachable();
          } break;
--- a/source/blender/gpu/metal/mtl_memory.hh
+++ b/source/blender/gpu/metal/mtl_memory.hh
@ -252,31 +252,32 @@ struct CompareMTLBuffer {
  }
 };

-/* An MTLSafeFreeList is a temporary list of gpu::MTLBuffers which have
+/**
+ * An #MTLSafeFreeList is a temporary list of #gpu::MTLBuffers which have
 * been freed by the high level backend, but are pending GPU work execution before
- * the gpu::MTLBuffers can be returned to the Memory manager pools.
+ * the #gpu::MTLBuffers can be returned to the Memory manager pools.
 * This list is implemented as a chunked linked-list.
 *
- * Only a single MTLSafeFreeList is active at one time and is associated with current command
- * buffer submissions. If an MTLBuffer is freed during the lifetime of a command buffer, it could
- * still possibly be in-use and as such, the MTLSafeFreeList will increment its reference count for
- * each command buffer submitted while the current pool is active.
+ * Only a single #MTLSafeFreeList is active at one time and is associated with current command
+ * buffer submissions. If an #MTLBuffer is freed during the lifetime of a command buffer, it could
+ * still possibly be in-use and as such, the #MTLSafeFreeList will increment its reference count
+ * for each command buffer submitted while the current pool is active.
 *
- * -- Reference count is incremented upon MTLCommandBuffer commit.
- * -- Reference count is decremented in the MTLCommandBuffer completion callback handler.
+ * - Reference count is incremented upon #MTLCommandBuffer commit.
+ * - Reference count is decremented in the #MTLCommandBuffer completion callback handler.
 *
- * A new MTLSafeFreeList will begin each render step (frame). This pooling of buffers, rather than
+ * A new #MTLSafeFreeList will begin each render step (frame). This pooling of buffers, rather than
 * individual buffer resource tracking reduces performance overhead.
 *
- *  * The reference count starts at 1 to ensure that the reference count cannot prematurely reach
- *  zero until any command buffers have been submitted. This additional decrement happens
- *  when the next MTLSafeFreeList is created, to allow the existing pool to be released once
- *  the reference count hits zero after submitted command buffers complete.
+ * - The reference count starts at 1 to ensure that the reference count cannot prematurely reach
+ *   zero until any command buffers have been submitted. This additional decrement happens
+ *   when the next #MTLSafeFreeList is created, to allow the existing pool to be released once
+ *   the reference count hits zero after submitted command buffers complete.
 *
 * NOTE: the Metal API independently tracks resources used by command buffers for the purpose of
 * keeping resources alive while in-use by the driver and CPU, however, this differs from the
- * MTLSafeFreeList mechanism in the Metal backend, which exists for the purpose of allowing
- * previously allocated MTLBuffer resources to be re-used. This allows us to save on the expensive
+ * #MTLSafeFreeList mechanism in the Metal backend, which exists for the purpose of allowing
+ * previously allocated #MTLBuffer resources to be re-used. This allows us to save on the expensive
 * cost of memory allocation.
 */
 class MTLSafeFreeList {
--- a/source/blender/gpu/metal/mtl_shader.hh
+++ b/source/blender/gpu/metal/mtl_shader.hh
@ -788,7 +788,7 @@ inline bool mtl_convert_vertex_format(MTLVertexFormat shader_attrib_format,
          }
          else if (shader_attrib_format == MTLVertexFormatUInt && component_length == 4) {
            /* Special case here, format has been specified as GPU_COMP_U8 with 4 components, which
-             *is equivalent to a UInt-- so data will be compatible with shader interface. */
+             * is equivalent to a UInt-- so data will be compatible with shader interface. */
            out_vert_format = MTLVertexFormatUInt;
          }
          else {
--- a/source/blender/gpu/metal/mtl_shader.mm
+++ b/source/blender/gpu/metal/mtl_shader.mm
@ -1269,7 +1269,7 @@ bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx)
     * specialization constant, customized per unique pipeline state permutation.
     *
     * For Compute shaders, this offset is always zero, but this needs setting as
-     * it is expected as part of the common Metal shader header.*/
+     * it is expected as part of the common Metal shader header. */
    int MTL_uniform_buffer_base_index = 0;
    [values setConstantValue:&MTL_uniform_buffer_base_index
                        type:MTLDataTypeInt
--- a/source/blender/gpu/metal/mtl_shader_interface.mm
+++ b/source/blender/gpu/metal/mtl_shader_interface.mm
@ -362,6 +362,8 @@ void MTLShaderInterface::prepare_common_shader_inputs()
  BLI_assert(&inputs_[attr_len_ + ubo_len_ + uniform_len_] >= current_input);
  current_input = &inputs_[attr_len_ + ubo_len_ + uniform_len_];

+  this->sort_inputs();
+
  /* Map builtin uniform indices to uniform binding locations. */
  this->map_builtins();
 }
--- a/source/blender/gpu/metal/mtl_texture.hh
+++ b/source/blender/gpu/metal/mtl_texture.hh
@ -353,8 +353,8 @@ class MTLTexture : public Texture {
   *
   *  blender::map<INPUT DEFINES STRUCT, compute PSO> update_2d_array_kernel_psos;
   * - Generate compute shader with configured kernel below with variable parameters depending
-   *  on input/output format configurations. Do not need to keep source or descriptors around,
-   *  just PSO, as same input defines will always generate the same code.
+   *   on input/output format configurations. Do not need to keep source or descriptors around,
+   *   just PSO, as same input defines will always generate the same code.
   *
   * - IF datatype IS an exact match e.g. :
   *    - Per-component size matches (e.g. GPU_DATA_UBYTE)
--- a/source/blender/gpu/opengl/gl_texture.cc
+++ b/source/blender/gpu/opengl/gl_texture.cc
@ -594,11 +594,11 @@ void GLTexture::samplers_init()
  }
  samplers_update();

-  /* Custom sampler for icons. */
+  /* Custom sampler for icons.
+   * NOTE: The icon texture is sampled within the shader using a -0.5f LOD bias. */
  GLuint icon_sampler = samplers_[GPU_SAMPLER_ICON];
  glSamplerParameteri(icon_sampler, GL_TEXTURE_MIN_FILTER, GL_LINEAR_MIPMAP_NEAREST);
  glSamplerParameteri(icon_sampler, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-  glSamplerParameterf(icon_sampler, GL_TEXTURE_LOD_BIAS, -0.5f);

  debug::object_label(GL_SAMPLER, icon_sampler, "icons");
 }
--- a/source/blender/gpu/shaders/gpu_shader_2D_widget_shadow_vert.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_2D_widget_shadow_vert.glsl
@ -25,7 +25,7 @@
 void main()
 {
  /* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
-   * memory pressure.*/
+   * memory pressure. */
  const vec2 cornervec[36] = vec2[36](vec2(0.0, 1.0),
                                      vec2(0.02, 0.805),
                                      vec2(0.067, 0.617),
--- a/source/blender/gpu/shaders/gpu_shader_display_fallback_frag.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_display_fallback_frag.glsl
@ -1,5 +1,5 @@

 void main()
 {
-   fragColor = texture(image_texture, texCoord_interp);
+  fragColor = texture(image_texture, texCoord_interp);
 }
--- a/source/blender/gpu/shaders/gpu_shader_display_fallback_vert.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_display_fallback_vert.glsl
@ -1,11 +1,11 @@

 vec2 normalize_coordinates()
 {
-   return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);
+  return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);
 }

 void main()
 {
-   gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);
-   texCoord_interp = texCoord;
+  gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);
+  texCoord_interp = texCoord;
 }
--- a/source/blender/gpu/shaders/gpu_shader_gpencil_stroke_vert_no_geom.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_gpencil_stroke_vert_no_geom.glsl
@ -0,0 +1,344 @@
+
+#pragma USE_SSBO_VERTEX_FETCH(TriangleList, 27)
+
+#define GP_XRAY_FRONT 0
+#define GP_XRAY_3DSPACE 1
+#define GP_XRAY_BACK 2
+
+#define GPENCIL_FLATCAP 1
+
+#define DISCARD_VERTEX \
+  gl_Position = vec4(0.0); \
+  return;
+
+/* Project a 3D point to 2D: divide XY by W, then scale by `gpencil_stroke_data.viewport`. */
+vec2 toScreenSpace(vec4 in_vertex)
+{
+  return vec2(in_vertex.xy / in_vertex.w) * gpencil_stroke_data.viewport;
+}
+
+/* Get the output depth for `point`, selected by `gpencil_stroke_data.xraymode`. */
+float getZdepth(vec4 point)
+{
+  if (gpencil_stroke_data.xraymode == GP_XRAY_FRONT) {
+    return 0.0;
+  }
+  if (gpencil_stroke_data.xraymode == GP_XRAY_3DSPACE) {
+    return (point.z / point.w);
+  }
+  if (gpencil_stroke_data.xraymode == GP_XRAY_BACK) {
+    return 1.0;
+  }
+
+  /* Unrecognized mode: in front by default (same value as `GP_XRAY_FRONT`). */
+  return 0.0;
+}
+
+/* Check that the XYZ components match within a small tolerance (W is not compared). */
+bool is_equal(vec4 p1, vec4 p2)
+{
+  float limit = 0.0001;
+  float x = abs(p1.x - p2.x);
+  float y = abs(p1.y - p2.y);
+  float z = abs(p1.z - p2.z);
+
+  if ((x < limit) && (y < limit) && (z < limit)) {
+    return true;
+  }
+
+  return false;
+}
+
+/* Vertex emission. */
+
+#define EMIT_VERTEX(vertex_selector, _v0, _v1, _v2) \
+  { \
+    switch (vertex_selector) { \
+      case 0: { \
+        _v0 \
+      } break; \
+      case 1: { \
+        _v1 \
+      } break; \
+      case 2: { \
+        _v2 \
+      } break; \
+    } \
+  } \
+  return;
+
+#define EMIT_VERTEX_COND(vertex_selector, condition, _v0, _v1, _v2) \
+  { \
+    if (condition) { \
+      switch (vertex_selector) { \
+        case 0: { \
+          _v0 \
+        } break; \
+        case 1: { \
+          _v1 \
+        } break; \
+        case 2: { \
+          _v2 \
+        } break; \
+      } \
+    } \
+    else { \
+      DISCARD_VERTEX; \
+    } \
+  } \
+  return;
+
+/** All output vertex combinations. */
+/* Excessively long miter gap. */
+#define V0_a \
+  geometry_out.mTexCoord = vec2(0, 0); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4( \
+      (sp1 + finalThickness[2] * n0) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+#define V1_a \
+  geometry_out.mTexCoord = vec2(0, 0); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4( \
+      (sp1 + finalThickness[2] * n1) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+#define V2_a \
+  geometry_out.mTexCoord = vec2(0, 0.5); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4(sp1 / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+#define V0_b \
+  geometry_out.mTexCoord = vec2(0, 1); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4( \
+      (sp1 - finalThickness[2] * n1) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+#define V1_b \
+  geometry_out.mTexCoord = vec2(0, 1); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4( \
+      (sp1 - finalThickness[2] * n0) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+#define V2_b \
+  geometry_out.mTexCoord = vec2(0, 0.5); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4(sp1 / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+/* -- Start end cap. -- */
+#define V3 \
+  geometry_out.mTexCoord = vec2(1, 0.5); \
+  geometry_out.mColor = vec4(finalColor[1].rgb, finalColor[1].a * -1.0); \
+  vec2 svn1 = normalize(sp1 - sp2) * length_a * 4.0 * extend; \
+  gl_Position = vec4((sp1 + svn1) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+/* V4. */
+#define V4 \
+  geometry_out.mTexCoord = vec2(0, 0); \
+  geometry_out.mColor = vec4(finalColor[1].rgb, finalColor[1].a * -1.0); \
+  gl_Position = vec4( \
+      (sp1 - (length_a * 2.0) * miter_a) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+/* V5. */
+#define V5 \
+  geometry_out.mTexCoord = vec2(0, 1); \
+  geometry_out.mColor = vec4(finalColor[1].rgb, finalColor[1].a * -1.0); \
+  gl_Position = vec4( \
+      (sp1 + (length_a * 2.0) * miter_a) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+/* -- Main triangle strip. -- */
+#define V6 \
+  geometry_out.mTexCoord = vec2(0, 0); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4( \
+      (sp1 + length_a * miter_a) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+/* V7. */
+#define V7 \
+  geometry_out.mTexCoord = vec2(0, 1); \
+  geometry_out.mColor = finalColor[1]; \
+  gl_Position = vec4( \
+      (sp1 - length_a * miter_a) / gpencil_stroke_data.viewport, getZdepth(P1), 1.0);
+
+/* V8. */
+#define V8 \
+  geometry_out.mTexCoord = vec2(0, 0); \
+  geometry_out.mColor = finalColor[2]; \
+  gl_Position = vec4( \
+      (sp2 + length_b * miter_b) / gpencil_stroke_data.viewport, getZdepth(P2), 1.0);
+
+/* V9. */
+#define V9 \
+  geometry_out.mTexCoord = vec2(0, 1); \
+  geometry_out.mColor = finalColor[2]; \
+  gl_Position = vec4( \
+      (sp2 - length_b * miter_b) / gpencil_stroke_data.viewport, getZdepth(P2), 1.0);
+
+/* End end-cap. */
+/* V10. */
+#define V10 \
+  geometry_out.mTexCoord = vec2(0, 1); \
+  geometry_out.mColor = vec4(finalColor[2].rgb, finalColor[2].a * -1.0); \
+  gl_Position = vec4( \
+      (sp2 + (length_b * 2.0) * miter_b) / gpencil_stroke_data.viewport, getZdepth(P2), 1.0);
+
+/* V11. */
+#define V11 \
+  geometry_out.mTexCoord = vec2(0, 0); \
+  geometry_out.mColor = vec4(finalColor[2].rgb, finalColor[2].a * -1.0); \
+  gl_Position = vec4( \
+      (sp2 - (length_b * 2.0) * miter_b) / gpencil_stroke_data.viewport, getZdepth(P2), 1.0);
+
+/* V12. */
+#define V12 \
+  geometry_out.mTexCoord = vec2(1, 0.5); \
+  geometry_out.mColor = vec4(finalColor[2].rgb, finalColor[2].a * -1.0); \
+  vec2 svn2 = normalize(sp2 - sp1) * length_b * 4.0 * extend; \
+  gl_Position = vec4((sp2 + svn2) / gpencil_stroke_data.viewport, getZdepth(P2), 1.0);
+
+void main(void)
+{
+  /* Determine output geometry IDs: each input primitive expands to 27 vertices (9 triangles). */
+  uint input_prim_id = gl_VertexID / 27;
+  uint output_vertex_id = gl_VertexID % 27;
+  uint output_prim_triangle_id = output_vertex_id / 3;
+  uint vertex_in_triangle = output_vertex_id % 3;
+
+  /** Run Vertex shader for all input vertices (Lines adjacency). */
+  vec4 finalPos[4];
+  vec4 finalColor[4];
+  float finalThickness[4];
+
+  float defaultpixsize = gpencil_stroke_data.pixsize * (1000.0 / gpencil_stroke_data.pixfactor);
+
+  for (int i = 0; i < 4; i++) {
+    finalPos[i] = ModelViewProjectionMatrix *
+                  vec4(vertex_fetch_attribute(input_prim_id + i, pos, vec3).xyz, 1.0);
+    finalColor[i] = vertex_fetch_attribute(input_prim_id + i, color, vec4);
+    float in_thickness = vertex_fetch_attribute(input_prim_id + i, thickness, float);
+
+    /* Use this vertex's own projected Z for depth scaling; `gl_Position` is unset here. */
+    if (gpencil_stroke_data.keep_size) {
+      finalThickness[i] = in_thickness;
+    }
+    else {
+      float depth_scale = (ProjectionMatrix[3][3] == 0.0) ? finalPos[i].z : 1.0;
+      float size = in_thickness / (depth_scale * defaultpixsize);
+      finalThickness[i] = max(size * gpencil_stroke_data.objscale, 1.0);
+    }
+  }
+
+  /** Perform Geometry shader alternative. */
+  float MiterLimit = 0.75;
+
+  /* receive 4 points */
+  vec4 P0 = finalPos[0];
+  vec4 P1 = finalPos[1];
+  vec4 P2 = finalPos[2];
+  vec4 P3 = finalPos[3];
+
+  /* get the four vertices passed to the shader */
+  vec2 sp0 = toScreenSpace(P0); /* start of previous segment */
+  vec2 sp1 = toScreenSpace(P1); /* end of previous segment, start of current segment */
+  vec2 sp2 = toScreenSpace(P2); /* end of current segment, start of next segment */
+  vec2 sp3 = toScreenSpace(P3); /* end of next segment */
+
+  /* culling outside viewport */
+  vec2 area = gpencil_stroke_data.viewport * 4.0;
+  if (sp1.x < -area.x || sp1.x > area.x) {
+    DISCARD_VERTEX;
+  }
+  if (sp1.y < -area.y || sp1.y > area.y) {
+    DISCARD_VERTEX;
+  }
+  if (sp2.x < -area.x || sp2.x > area.x) {
+    DISCARD_VERTEX;
+  }
+  if (sp2.y < -area.y || sp2.y > area.y) {
+    DISCARD_VERTEX;
+  }
+
+  /* determine the direction of each of the 3 segments (previous,
+   * current, next) */
+  vec2 v0 = normalize(sp1 - sp0);
+  vec2 v1 = normalize(sp2 - sp1);
+  vec2 v2 = normalize(sp3 - sp2);
+
+  /* determine the normal of each of the 3 segments (previous,
+   * current, next) */
+  vec2 n0 = vec2(-v0.y, v0.x);
+  vec2 n1 = vec2(-v1.y, v1.x);
+  vec2 n2 = vec2(-v2.y, v2.x);
+
+  /* determine miter lines by averaging the normals of the 2
+   * segments */
+  vec2 miter_a = normalize(n0 + n1); /* miter at start of current segment */
+  vec2 miter_b = normalize(n1 + n2); /* miter at end of current segment */
+
+  /* determine the length of the miter by projecting it onto normal
+   * and then inverse it */
+  float an1 = dot(miter_a, n1);
+  float bn1 = dot(miter_b, n2);
+  if (an1 == 0) {
+    an1 = 1;
+  }
+  if (bn1 == 0) {
+    bn1 = 1;
+  }
+  float length_a = finalThickness[1] / an1;
+  float length_b = finalThickness[2] / bn1;
+  if (length_a <= 0.0) {
+    length_a = 0.01;
+  }
+  if (length_b <= 0.0) {
+    length_b = 0.01;
+  }
+
+  /** Geometry output. */
+  /* First triangle (T0). prevent excessively long miters at sharp
+   * corners */
+  if (output_prim_triangle_id == 0) {
+    if (dot(v0, v1) < -MiterLimit) {
+      if (dot(v0, n1) > 0) {
+        EMIT_VERTEX(vertex_in_triangle, V0_a, V1_a, V2_a)
+      }
+      else {
+        EMIT_VERTEX(vertex_in_triangle, V0_b, V1_b, V2_b)
+      }
+    }
+    else {
+      DISCARD_VERTEX
+    }
+  }
+
+  if (dot(v1, v2) < -MiterLimit) {
+    miter_b = n1;
+    length_b = finalThickness[2];
+  }
+
+  float extend = gpencil_stroke_data.fill_stroke ? 2 : 1;
+  bool start_endcap = ((gpencil_stroke_data.caps_start != GPENCIL_FLATCAP) && is_equal(P0, P2));
+  bool end_endcap = (gpencil_stroke_data.caps_end != GPENCIL_FLATCAP) && is_equal(P1, P3);
+
+  switch (output_prim_triangle_id) {
+    /* -- Start end cap. -- */
+    case 1:
+      EMIT_VERTEX_COND(vertex_in_triangle, start_endcap, V3, V4, V5)
+    case 2:
+      EMIT_VERTEX_COND(vertex_in_triangle, start_endcap, V4, V5, V6)
+    case 3:
+      EMIT_VERTEX_COND(vertex_in_triangle, start_endcap, V5, V6, V7)
+    /* -- Standard triangle strip. -- */
+    case 4:
+      EMIT_VERTEX(vertex_in_triangle, V6, V7, V8)
+    case 5:
+      EMIT_VERTEX(vertex_in_triangle, V7, V8, V9)
+    /* -- End end cap. -- */
+    case 6:
+      EMIT_VERTEX_COND(vertex_in_triangle, end_endcap, V8, V9, V10)
+    case 7:
+      EMIT_VERTEX_COND(vertex_in_triangle, end_endcap, V9, V10, V11)
+    case 8:
+      EMIT_VERTEX_COND(vertex_in_triangle, end_endcap, V10, V11, V12)
+    default:
+      DISCARD_VERTEX
+  }
+}
--- a/source/blender/gpu/shaders/gpu_shader_icon_frag.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_icon_frag.glsl
@ -8,6 +8,10 @@

 void main()
 {
+  /* Sample texture with LOD bias. Used instead of custom LOD bias in GPU_SAMPLER_ICON. */
+  fragColor = texture(image, texCoord_interp, -0.5) * finalColor;
+
+#ifdef DO_CORNER_MASKING
  /* Top-left rounded corner parameters. */
  const float circle_radius_outer = 0.1;
  const float circle_radius_inner = 0.075;
@ -18,7 +22,6 @@ void main()
  const float mask_transparency = 0.25;

  vec2 circle_center = vec2(circle_radius_outer - text_width, 0.5);
-  fragColor = texture(image, texCoord_interp) * color;

  /* radius in icon space (1 is the icon width).  */
  float radius = length(mask_coord_interp - circle_center);
@ -39,4 +42,5 @@ void main()
  }

  fragColor = mix(vec4(0.0), fragColor, max(mask_transparency, mask));
+#endif
 }
--- a/source/blender/gpu/shaders/gpu_shader_2D_image_multi_rect_vert.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_2D_image_multi_rect_vert.glsl
@ -5,9 +5,9 @@

 void main()
 {
-  vec4 rect = multi_rect_data.calls_data[gl_InstanceID * 3];
-  vec4 tex = multi_rect_data.calls_data[gl_InstanceID * 3 + 1];
-  finalColor = multi_rect_data.calls_data[gl_InstanceID * 3 + 2];
+  vec4 rect = multi_icon_data.calls_data[gl_InstanceID * 3];
+  vec4 tex = multi_icon_data.calls_data[gl_InstanceID * 3 + 1];
+  finalColor = multi_icon_data.calls_data[gl_InstanceID * 3 + 2];

  /* Use pos to select the right swizzle (instead of gl_VertexID)
   * in order to workaround an OSX driver bug. */
--- a/source/blender/gpu/shaders/gpu_shader_image_varying_color_frag.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_image_varying_color_frag.glsl
@ -1,5 +0,0 @@
-
-void main()
-{
-  fragColor = texture(image, texCoord_interp) * finalColor;
-}
--- a/source/blender/gpu/shaders/gpu_shader_point_uniform_color_aa_frag.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_point_uniform_color_aa_frag.glsl
@ -6,7 +6,7 @@ void main()

  /* transparent outside of point
   * --- 0 ---
-   *  smooth transition
+   * smooth transition
   * --- 1 ---
   * pure point color
   * ...
--- a/source/blender/gpu/shaders/gpu_shader_text_frag.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_text_frag.glsl
@ -73,7 +73,7 @@ void main()

    if (interp_size == 1) {
      /* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
-       * memory pressure.*/
+       * memory pressure. */
      const vec2 offsets4[4] = vec2[4](
          vec2(-0.5, 0.5), vec2(0.5, 0.5), vec2(-0.5, -0.5), vec2(-0.5, -0.5));

@ -87,7 +87,7 @@ void main()
    }
    else {
      /* NOTE(Metal): Declaring constant array in function scope to avoid increasing local shader
-       * memory pressure.*/
+       * memory pressure. */
      const vec2 offsets16[16] = vec2[16](vec2(-1.5, 1.5),
                                          vec2(-0.5, 1.5),
                                          vec2(0.5, 1.5),
--- a/source/blender/gpu/shaders/infos/gpu_shader_2D_image_multi_rect_color_info.hh
+++ b/source/blender/gpu/shaders/infos/gpu_shader_2D_image_multi_rect_color_info.hh
@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#include "gpu_interface_info.hh"
-#include "gpu_shader_create_info.hh"
-
-GPU_SHADER_CREATE_INFO(gpu_shader_2D_image_multi_rect_color)
-    .vertex_in(0, Type::VEC2, "pos")
-    .vertex_out(flat_color_smooth_tex_coord_interp_iface)
-    .fragment_out(0, Type::VEC4, "fragColor")
-    .uniform_buf(0, "MultiRectCallData", "multi_rect_data")
-    .sampler(0, ImageType::FLOAT_2D, "image")
-    .typedef_source("GPU_shader_shared.h")
-    .vertex_source("gpu_shader_2D_image_multi_rect_vert.glsl")
-    .fragment_source("gpu_shader_image_varying_color_frag.glsl")
-    .do_static_compilation(true);
--- a/source/blender/gpu/shaders/infos/gpu_shader_gpencil_stroke_info.hh
+++ b/source/blender/gpu/shaders/infos/gpu_shader_gpencil_stroke_info.hh
@ -25,17 +25,20 @@ GPU_SHADER_CREATE_INFO(gpu_shader_gpencil_stroke_base)

    .push_constant(Type::MAT4, "ModelViewProjectionMatrix")
    .push_constant(Type::MAT4, "ProjectionMatrix")
-    .vertex_source("gpu_shader_gpencil_stroke_vert.glsl")
    .fragment_source("gpu_shader_gpencil_stroke_frag.glsl")
    .typedef_source("GPU_shader_shared.h");

-GPU_SHADER_CREATE_INFO(gpu_shader_gpencil_stroke_geom)
+GPU_SHADER_CREATE_INFO(gpu_shader_gpencil_stroke)
    .additional_info("gpu_shader_gpencil_stroke_base")
    .geometry_layout(PrimitiveIn::LINES_ADJACENCY, PrimitiveOut::TRIANGLE_STRIP, 13)
    .geometry_out(gpencil_stroke_geom_iface)
+    .vertex_source("gpu_shader_gpencil_stroke_vert.glsl")
    .geometry_source("gpu_shader_gpencil_stroke_geom.glsl")
    .do_static_compilation(true);

-GPU_SHADER_CREATE_INFO(gpu_shader_gpencil_stroke_nogeom)
+GPU_SHADER_CREATE_INFO(gpu_shader_gpencil_stroke_no_geom)
+    .metal_backend_only(true)
    .additional_info("gpu_shader_gpencil_stroke_base")
+    .vertex_out(gpencil_stroke_geom_iface)
+    .vertex_source("gpu_shader_gpencil_stroke_vert_no_geom.glsl")
    .do_static_compilation(true);
--- a/source/blender/gpu/shaders/infos/gpu_shader_icon_info.hh
+++ b/source/blender/gpu/shaders/infos/gpu_shader_icon_info.hh
@ -9,10 +9,11 @@
 #include "gpu_shader_create_info.hh"

 GPU_SHADER_CREATE_INFO(gpu_shader_icon)
+    .define("DO_CORNER_MASKING")
    .vertex_out(smooth_icon_interp_iface)
    .fragment_out(0, Type::VEC4, "fragColor")
    .push_constant(Type::MAT4, "ModelViewProjectionMatrix")
-    .push_constant(Type::VEC4, "color")
+    .push_constant(Type::VEC4, "finalColor")
    .push_constant(Type::VEC4, "rect_icon")
    .push_constant(Type::VEC4, "rect_geom")
    .push_constant(Type::FLOAT, "text_width")
@ -20,3 +21,14 @@ GPU_SHADER_CREATE_INFO(gpu_shader_icon)
    .vertex_source("gpu_shader_icon_vert.glsl")
    .fragment_source("gpu_shader_icon_frag.glsl")
    .do_static_compilation(true);
+
+GPU_SHADER_CREATE_INFO(gpu_shader_icon_multi)
+    .vertex_in(0, Type::VEC2, "pos")
+    .vertex_out(flat_color_smooth_tex_coord_interp_iface)
+    .fragment_out(0, Type::VEC4, "fragColor")
+    .uniform_buf(0, "MultiIconCallData", "multi_icon_data")
+    .sampler(0, ImageType::FLOAT_2D, "image")
+    .typedef_source("GPU_shader_shared.h")
+    .vertex_source("gpu_shader_icon_multi_vert.glsl")
+    .fragment_source("gpu_shader_icon_frag.glsl")
+    .do_static_compilation(true);
--- a/source/blender/gpu/tests/gpu_shader_builtin_test.cc
+++ b/source/blender/gpu/tests/gpu_shader_builtin_test.cc
@ -36,7 +36,7 @@ static void test_shader_builtin()
  test_compile_builtin_shader(GPU_SHADER_3D_IMAGE_COLOR, GPU_SHADER_CFG_DEFAULT);
  test_compile_builtin_shader(GPU_SHADER_2D_IMAGE_DESATURATE_COLOR, GPU_SHADER_CFG_DEFAULT);
  test_compile_builtin_shader(GPU_SHADER_2D_IMAGE_RECT_COLOR, GPU_SHADER_CFG_DEFAULT);
-  test_compile_builtin_shader(GPU_SHADER_2D_IMAGE_MULTI_RECT_COLOR, GPU_SHADER_CFG_DEFAULT);
+  test_compile_builtin_shader(GPU_SHADER_ICON_MULTI, GPU_SHADER_CFG_DEFAULT);
  test_compile_builtin_shader(GPU_SHADER_2D_CHECKER, GPU_SHADER_CFG_DEFAULT);
  test_compile_builtin_shader(GPU_SHADER_2D_DIAG_STRIPES, GPU_SHADER_CFG_DEFAULT);
  test_compile_builtin_shader(GPU_SHADER_3D_CLIPPED_UNIFORM_COLOR, GPU_SHADER_CFG_DEFAULT);
--- a/source/blender/gpu/tests/gpu_storage_buffer_test.cc
+++ b/source/blender/gpu/tests/gpu_storage_buffer_test.cc
@ -37,7 +37,7 @@ static void test_gpu_storage_buffer_create_update_read()
  read_data.resize(SIZE, 0);
  GPU_storagebuf_read(ssbo, read_data.data());

-  /* Check if data is the same.*/
+  /* Check if data is the same. */
  for (int i : IndexRange(SIZE)) {
    EXPECT_EQ(data[i], read_data[i]);
  }
@ -47,4 +47,4 @@ static void test_gpu_storage_buffer_create_update_read()

 GPU_TEST(gpu_storage_buffer_create_update_read);

-}  // namespace blender::gpu::tests
+}  // namespace blender::gpu::tests
--- a/source/blender/gpu/vulkan/vk_command_buffer.cc
+++ b/source/blender/gpu/vulkan/vk_command_buffer.cc
@ -129,7 +129,7 @@ void VKCommandBuffer::encode_recorded_commands()
 {
  /* Intentionally not implemented. For the graphics pipeline we want to extract the
   * resources and its usages so we can encode multiple commands in the same command buffer with
-   * the correct synchorinzations. */
+   * the correct synchronizations. */
 }

 void VKCommandBuffer::submit_encoded_commands()
--- a/Show More
+++ b/Show More