WIP: uv-simple-select #1

Closed
Chris Blackbourn wants to merge 182 commits from uv-simple-select into main

21 changed files with 800 additions and 102 deletions
Showing only changes of commit b47c938af2.


@ -197,6 +197,7 @@ void DRW_gpu_render_context_enable(void *re_gpu_context);
void DRW_gpu_render_context_disable(void *re_gpu_context);
void DRW_deferred_shader_remove(struct GPUMaterial *mat);
void DRW_deferred_shader_optimize_remove(struct GPUMaterial *mat);
/**
* Get DrawData from the given ID-block. In order for this to work, we assume that


@ -53,6 +53,7 @@ static void eevee_engine_init(void *ved)
stl->g_data->valid_double_buffer = (txl->color_double_buffer != NULL);
stl->g_data->valid_taa_history = (txl->taa_history != NULL);
stl->g_data->queued_shaders_count = 0;
stl->g_data->queued_optimise_shaders_count = 0;
stl->g_data->render_timesteps = 1;
stl->g_data->disable_ligthprobes = v3d &&
(v3d->object_type_exclude_viewport & (1 << OB_LIGHTPROBE));
@ -178,6 +179,11 @@ static void eevee_cache_finish(void *vedata)
if (g_data->queued_shaders_count > 0) {
SNPRINTF(ved->info, TIP_("Compiling Shaders (%d remaining)"), g_data->queued_shaders_count);
}
else if (g_data->queued_optimise_shaders_count > 0) {
SNPRINTF(ved->info,
TIP_("Optimizing Shaders (%d remaining)"),
g_data->queued_optimise_shaders_count);
}
}
/* As renders in an HDR off-screen buffer, we need to draw everything once


@ -1000,6 +1000,8 @@ typedef struct EEVEE_PrivateData {
/* Compiling shaders count. This is to track if a shader has finished compiling. */
int queued_shaders_count;
int queued_shaders_count_prev;
/* Optimising shaders count. */
int queued_optimise_shaders_count;
/* LookDev Settings */
int studiolight_index;


@ -1390,12 +1390,21 @@ struct GPUMaterial *EEVEE_material_get(
return nullptr;
}
switch (status) {
case GPU_MAT_SUCCESS:
break;
case GPU_MAT_QUEUED:
case GPU_MAT_SUCCESS: {
/* Determine optimization status for remaining compilations counter. */
int optimization_status = GPU_material_optimization_status(mat);
if (optimization_status == GPU_MAT_OPTIMIZATION_QUEUED) {
vedata->stl->g_data->queued_optimise_shaders_count++;
}
} break;
case GPU_MAT_QUEUED: {
vedata->stl->g_data->queued_shaders_count++;
mat = EEVEE_material_default_get(scene, ma, options);
break;
GPUMaterial *default_mat = EEVEE_material_default_get(scene, ma, options);
/* Mark pending material with its default material for future cache warming. */
GPU_material_set_default(mat, default_mat);
/* Return default material. */
mat = default_mat;
} break;
case GPU_MAT_FAILED:
default:
ma = EEVEE_material_default_error_get();


@ -507,6 +507,8 @@ GPUMaterial *ShaderModule::material_shader_get(const char *name,
this);
GPU_material_status_set(gpumat, GPU_MAT_QUEUED);
GPU_material_compile(gpumat);
/* Queue deferred material optimization. */
DRW_shader_queue_optimize_material(gpumat);
return gpumat;
}


@ -307,6 +307,7 @@ struct GPUMaterial *DRW_shader_from_material(struct Material *ma,
bool deferred,
GPUCodegenCallbackFn callback,
void *thunk);
void DRW_shader_queue_optimize_material(struct GPUMaterial *mat);
void DRW_shader_free(struct GPUShader *shader);
#define DRW_SHADER_FREE_SAFE(shader) \
do { \


@ -55,6 +55,9 @@ typedef struct DRWShaderCompiler {
ListBase queue; /* GPUMaterial */
SpinLock list_lock;
/** Optimization queue. */
ListBase optimize_queue; /* GPUMaterial */
void *gl_context;
GPUContext *gpu_context;
bool own_context;
@ -110,8 +113,29 @@ static void drw_deferred_shader_compilation_exec(
MEM_freeN(link);
}
else {
/* No more materials to optimize, or shaders to compile. */
break;
/* Check for Material Optimization job once there are no more
* shaders to compile. */
BLI_spin_lock(&comp->list_lock);
/* Pop tail because it will be less likely to lock the main thread
* if all GPUMaterials are to be freed (see DRW_deferred_shader_remove()). */
LinkData *link = (LinkData *)BLI_poptail(&comp->optimize_queue);
GPUMaterial *optimize_mat = link ? (GPUMaterial *)link->data : NULL;
if (optimize_mat) {
/* Avoid another thread freeing the material during optimization. */
GPU_material_acquire(optimize_mat);
}
BLI_spin_unlock(&comp->list_lock);
if (optimize_mat) {
/* Compile optimized material shader. */
GPU_material_optimize(optimize_mat);
GPU_material_release(optimize_mat);
MEM_freeN(link);
}
else {
/* No more materials to optimize, or shaders to compile. */
break;
}
}
if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_ANY, GPU_DRIVER_ANY, GPU_BACKEND_OPENGL)) {
@ -133,6 +157,7 @@ static void drw_deferred_shader_compilation_free(void *custom_data)
BLI_spin_lock(&comp->list_lock);
BLI_freelistN(&comp->queue);
BLI_freelistN(&comp->optimize_queue);
BLI_spin_unlock(&comp->list_lock);
if (comp->own_context) {
@ -148,34 +173,13 @@ static void drw_deferred_shader_compilation_free(void *custom_data)
MEM_freeN(comp);
}
static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
/**
* Append either a shader compilation or an optimization job to the deferred queue and
* ensure the shader compilation worker is active.
* We keep two separate queues to ensure core compilations always complete before optimization.
*/
static void drw_deferred_queue_append(GPUMaterial *mat, bool is_optimization_job)
{
if (ELEM(GPU_material_status(mat), GPU_MAT_SUCCESS, GPU_MAT_FAILED)) {
return;
}
/* Do not defer the compilation if we are rendering an image.
* Deferred compilation is only possible when `evil_C` is available. */
if (DST.draw_ctx.evil_C == NULL || DRW_state_is_image_render() || !USE_DEFERRED_COMPILATION) {
deferred = false;
}
if (!deferred) {
DRW_deferred_shader_remove(mat);
/* Shaders could already be compiling. Have to wait for compilation to finish. */
while (GPU_material_status(mat) == GPU_MAT_QUEUED) {
PIL_sleep_ms(20);
}
if (GPU_material_status(mat) == GPU_MAT_CREATED) {
GPU_material_compile(mat);
}
return;
}
/* Don't add material to the queue twice. */
if (GPU_material_status(mat) == GPU_MAT_QUEUED) {
return;
}
const bool use_main_context = GPU_use_main_context_workaround();
const bool job_own_context = !use_main_context;
@ -196,6 +200,7 @@ static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
if (old_comp) {
BLI_spin_lock(&old_comp->list_lock);
BLI_movelisttolist(&comp->queue, &old_comp->queue);
BLI_movelisttolist(&comp->optimize_queue, &old_comp->optimize_queue);
BLI_spin_unlock(&old_comp->list_lock);
/* Do not recreate context, just pass ownership. */
if (old_comp->gl_context) {
@ -206,9 +211,18 @@ static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
}
}
GPU_material_status_set(mat, GPU_MAT_QUEUED);
LinkData *node = BLI_genericNodeN(mat);
BLI_addtail(&comp->queue, node);
/* Add to either compilation or optimization queue. */
if (is_optimization_job) {
BLI_assert(GPU_material_optimization_status(mat) != GPU_MAT_OPTIMIZATION_QUEUED);
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_QUEUED);
LinkData *node = BLI_genericNodeN(mat);
BLI_addtail(&comp->optimize_queue, node);
}
else {
GPU_material_status_set(mat, GPU_MAT_QUEUED);
LinkData *node = BLI_genericNodeN(mat);
BLI_addtail(&comp->queue, node);
}
/* Create only one context. */
if (comp->gl_context == NULL) {
@ -237,6 +251,39 @@ static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
WM_jobs_start(wm, wm_job);
}
static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
{
if (ELEM(GPU_material_status(mat), GPU_MAT_SUCCESS, GPU_MAT_FAILED)) {
return;
}
/* Do not defer the compilation if we are rendering an image.
* Deferred compilation is only possible when `evil_C` is available. */
if (DST.draw_ctx.evil_C == NULL || DRW_state_is_image_render() || !USE_DEFERRED_COMPILATION) {
deferred = false;
}
if (!deferred) {
DRW_deferred_shader_remove(mat);
/* Shaders could already be compiling. Have to wait for compilation to finish. */
while (GPU_material_status(mat) == GPU_MAT_QUEUED) {
PIL_sleep_ms(20);
}
if (GPU_material_status(mat) == GPU_MAT_CREATED) {
GPU_material_compile(mat);
}
return;
}
/* Don't add material to the queue twice. */
if (GPU_material_status(mat) == GPU_MAT_QUEUED) {
return;
}
/* Add deferred shader compilation to queue. */
drw_deferred_queue_append(mat, false);
}
static void drw_register_shader_vlattrs(GPUMaterial *mat)
{
const ListBase *attrs = GPU_material_layer_attributes(mat);
@ -288,9 +335,42 @@ void DRW_deferred_shader_remove(GPUMaterial *mat)
BLI_remlink(&comp->queue, link);
GPU_material_status_set(link->data, GPU_MAT_CREATED);
}
BLI_spin_unlock(&comp->list_lock);
MEM_SAFE_FREE(link);
/* Search for optimization job in queue. */
LinkData *opti_link = (LinkData *)BLI_findptr(
&comp->optimize_queue, mat, offsetof(LinkData, data));
if (opti_link) {
BLI_remlink(&comp->optimize_queue, opti_link);
GPU_material_optimization_status_set(opti_link->data, GPU_MAT_OPTIMIZATION_READY);
}
BLI_spin_unlock(&comp->list_lock);
MEM_SAFE_FREE(opti_link);
}
}
}
}
void DRW_deferred_shader_optimize_remove(GPUMaterial *mat)
{
LISTBASE_FOREACH (wmWindowManager *, wm, &G_MAIN->wm) {
LISTBASE_FOREACH (wmWindow *, win, &wm->windows) {
DRWShaderCompiler *comp = (DRWShaderCompiler *)WM_jobs_customdata_from_type(
wm, wm, WM_JOB_TYPE_SHADER_COMPILATION);
if (comp != NULL) {
BLI_spin_lock(&comp->list_lock);
/* Search for optimization job in queue. */
LinkData *opti_link = (LinkData *)BLI_findptr(
&comp->optimize_queue, mat, offsetof(LinkData, data));
if (opti_link) {
BLI_remlink(&comp->optimize_queue, opti_link);
GPU_material_optimization_status_set(opti_link->data, GPU_MAT_OPTIMIZATION_READY);
}
BLI_spin_unlock(&comp->list_lock);
MEM_SAFE_FREE(opti_link);
}
}
}
@ -432,6 +512,7 @@ GPUMaterial *DRW_shader_from_world(World *wo,
}
drw_deferred_shader_add(mat, deferred);
DRW_shader_queue_optimize_material(mat);
return mat;
}
@ -463,9 +544,52 @@ GPUMaterial *DRW_shader_from_material(Material *ma,
}
drw_deferred_shader_add(mat, deferred);
DRW_shader_queue_optimize_material(mat);
return mat;
}
void DRW_shader_queue_optimize_material(GPUMaterial *mat)
{
/* Do not perform deferred optimization when performing an image render.
* De-queue any queued optimization jobs. */
if (DRW_state_is_image_render()) {
if (GPU_material_optimization_status(mat) == GPU_MAT_OPTIMIZATION_QUEUED) {
/* Remove from pending optimization job queue. */
DRW_deferred_shader_optimize_remove(mat);
/* If optimization job had already started, wait for it to complete. */
while (GPU_material_optimization_status(mat) == GPU_MAT_OPTIMIZATION_QUEUED) {
PIL_sleep_ms(20);
}
}
return;
}
/* We do not need to perform optimization on the material if it is already compiled or in the
* optimization queue. If optimization is not required, the status will be flagged as
* `GPU_MAT_OPTIMIZATION_SKIP`.
* We can also skip cases which have already been queued up. */
if (ELEM(GPU_material_optimization_status(mat),
GPU_MAT_OPTIMIZATION_SKIP,
GPU_MAT_OPTIMIZATION_SUCCESS,
GPU_MAT_OPTIMIZATION_QUEUED)) {
return;
}
/* Only queue optimization once the original shader has been successfully compiled. */
if (GPU_material_status(mat) != GPU_MAT_SUCCESS) {
return;
}
/* Defer optimization until sufficient time has passed beyond creation. This avoids excessive
* recompilation for shaders which are being actively modified. */
if (!GPU_material_optimization_ready(mat)) {
return;
}
/* Add deferred shader optimization to queue. */
drw_deferred_queue_append(mat, true);
}
void DRW_shader_free(GPUShader *shader)
{
GPU_shader_free(shader);


@ -254,9 +254,20 @@ void GPU_materials_free(struct Main *bmain);
struct Scene *GPU_material_scene(GPUMaterial *material);
struct GPUPass *GPU_material_get_pass(GPUMaterial *material);
/* Return the most optimal shader configuration for the given material. */
struct GPUShader *GPU_material_get_shader(GPUMaterial *material);
/* Return the base un-optimized shader. */
struct GPUShader *GPU_material_get_shader_base(GPUMaterial *material);
const char *GPU_material_get_name(GPUMaterial *material);
/**
* Material Optimization.
* \note Compiles optimal version of shader graph, populating mat->optimized_pass.
* This operation should always be deferred until existing compilations have completed.
* Default un-optimized materials will still exist for interactive material editing performance.
*/
void GPU_material_optimize(GPUMaterial *mat);
/**
* Return can be NULL if it's a world material.
*/
@ -274,6 +285,24 @@ eGPUMaterialOptimizationStatus GPU_material_optimization_status(GPUMaterial *mat
void GPU_material_optimization_status_set(GPUMaterial *mat, eGPUMaterialOptimizationStatus status);
bool GPU_material_optimization_ready(GPUMaterial *mat);
/**
* Store reference to a similar default material for async PSO cache warming.
*
* This function expects `material` to have not yet been compiled and for `default_material` to be
* ready. When compiling `material` as part of an async shader compilation job, use existing PSO
* descriptors from `default_material`'s shader to also compile PSOs for this new material
* asynchronously, rather than at runtime.
*
* The `default_material` options should match this new material's options in order
* for its PSO descriptors to match those needed by the new `material`.
*
* NOTE: `default_material` must exist when `GPU_material_compile(..)` is called for
* `material`.
*
* See `GPU_shader_warm_cache(..)` for more information.
*/
void GPU_material_set_default(GPUMaterial *material, GPUMaterial *default_material);
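For illustration, a minimal hedged sketch of the call order this note describes (the wrapper function and variable names are hypothetical and not part of this patch; it assumes the default material is already compiled, mirroring the EEVEE_material_get() change earlier in this diff):

/* Sketch only (not part of this patch): wiring a queued material to its already-compiled
 * default so the deferred compile job can warm its PSO cache. `queued_mat` and `default_mat`
 * are hypothetical locals. */
static void example_register_default_for_warming(GPUMaterial *queued_mat, GPUMaterial *default_mat)
{
  /* The default material must already be compiled and use matching options so its cached
   * PSO descriptors apply to the queued material (see the note above). */
  if (queued_mat != default_mat && GPU_material_status(default_mat) == GPU_MAT_SUCCESS) {
    GPU_material_set_default(queued_mat, default_mat);
  }
  /* When the deferred job later calls GPU_material_compile(queued_mat), the backend can
   * warm the new shader's PSO cache from the default material's shader. */
}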
struct GPUUniformBuf *GPU_material_uniform_buffer_get(GPUMaterial *material);
/**
* Create dynamic UBO from parameters


@ -217,6 +217,47 @@ GPUShader *GPU_shader_create_ex(const char *vertcode,
bool GPU_shader_transform_feedback_enable(GPUShader *shader, struct GPUVertBuf *vertbuf);
void GPU_shader_transform_feedback_disable(GPUShader *shader);
/**
* Shader cache warming.
* For each shader, rendering APIs perform a two-step compilation:
*
* * The first stage is Front-End compilation which only needs to be performed once, and generates
* a portable intermediate representation. This happens during `gpu::Shader::finalize()`.
*
* * The second is Back-End compilation which compiles a device-specific executable shader
* program. This compilation requires some contextual pipeline state which is baked into the
* executable shader source, producing a Pipeline State Object (PSO). In OpenGL, backend
* compilation happens in the background, within the driver, but can still incur runtime stutters.
* In Metal/Vulkan, PSOs are compiled explicitly. These are currently resolved within the backend
* based on the current pipeline state and can incur runtime stalls when they occur.
*
* Shader Cache warming uses the specified parent shader set using `GPU_shader_set_parent(..)` as a
* template reference for pre-compiling Render Pipeline State Objects (PSOs) outside of the main
* render pipeline.
*
* PSOs require descriptors containing information on the render state for a given shader, which
* includes input vertex data layout and output pixel formats, along with some state such as
* blend mode and colour output masks. As this state information is usually consistent between
* similar draws, we can assign a parent shader and use this shader's cached pipeline states to
* prime compilations.
*
* Shaders do not necessarily have to be similar in functionality to be used as a parent, so long
* as the GPUVertFormat and GPUFrameBuffer which they are used with remain the same. Other bindings
* such as textures, uniforms and UBOs are all assigned independently as dynamic state.
*
* This function should be called asynchronously, mitigating the impact of run-time stuttering from
* dynamic compilation of PSOs during normal rendering.
*
* \param shader: The shader whose cache to warm.
* \param limit: The maximum number of PSOs to compile within a call. Specifying
* a limit <= 0 will compile PSOs for all cached descriptors in the parent shader. */
void GPU_shader_warm_cache(GPUShader *shader, int limit);
/* We expect the parent shader to be compiled and already have some cached PSOs when being assigned
* as a reference. Ensure the parent shader still exists when `GPU_shader_warm_cache(..)` is
* called. */
void GPU_shader_set_parent(GPUShader *shader, GPUShader *parent);
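A minimal usage sketch of the two declarations above (hypothetical wrapper, not part of this patch; assumes both shaders are compiled and are used with the same GPUVertFormat and GPUFrameBuffer configuration, as the note requires):

/* Sketch only (not part of this patch): typical warming sequence for a freshly compiled
 * shader, as done in GPU_material_compile() later in this diff. */
static void example_warm_from_parent(GPUShader *new_sh, GPUShader *parent_sh)
{
  if (parent_sh == NULL || parent_sh == new_sh) {
    return;
  }
  GPU_shader_set_parent(new_sh, parent_sh);
  /* A small limit keeps the async job cheap; per the note above, the first cached entries
   * tend to be the most commonly used PSOs. A limit <= 0 would warm every cached PSO. */
  GPU_shader_warm_cache(new_sh, 1);
}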
/** DEPRECATED: Kept only because of BGL API. */
int GPU_shader_get_program(GPUShader *shader);


@ -25,6 +25,7 @@
#include "BKE_material.h"
#include "GPU_capabilities.h"
#include "GPU_context.h"
#include "GPU_material.h"
#include "GPU_shader.h"
#include "GPU_uniform_buffer.h"
@ -95,6 +96,9 @@ struct GPUPass {
uint32_t hash;
/** Did we already try to compile the attached GPUShader? */
bool compiled;
/** Hint that an optimized variant of this pass should be created based on a complexity heuristic
* during pass code generation. */
bool should_optimize;
};
/* -------------------------------------------------------------------- */
@ -252,6 +256,11 @@ class GPUCodegen {
ListBase ubo_inputs_ = {nullptr, nullptr};
GPUInput *cryptomatte_input_ = nullptr;
/** Cache parameters for complexity heuristic. */
uint nodes_total_ = 0;
uint textures_total_ = 0;
uint uniforms_total_ = 0;
public:
GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_) : mat(*mat_), graph(*graph_)
{
@ -292,6 +301,17 @@ class GPUCodegen {
return hash_;
}
/* Heuristic determined during pass codegen for whether a
* more optimal variant of this material should be compiled. */
bool should_optimize_heuristic() const
{
/* If any of the maximal limits is exceeded, we can optimize, but we should also ensure
* the baseline is met. */
bool do_optimize = (nodes_total_ >= 60 || textures_total_ >= 4 || uniforms_total_ >= 64) &&
(textures_total_ >= 1 && uniforms_total_ >= 8 && nodes_total_ >= 4);
return do_optimize;
}
private:
void set_unique_ids();
@ -413,6 +433,9 @@ void GPUCodegen::generate_resources()
}
}
/* Record texture count for the complexity heuristic. */
textures_total_ = slot;
if (!BLI_listbase_is_empty(&ubo_inputs_)) {
/* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */
ss << "struct NodeTree {\n";
@ -454,11 +477,16 @@ void GPUCodegen::generate_library()
GPUCodegenCreateInfo &info = *create_info;
void *value;
GSetIterState pop_state = {};
while (BLI_gset_pop(graph.used_libraries, &pop_state, &value)) {
/* Iterate over libraries. We need to keep this struct intact in case
* it is required for the optimization pass. */
GHashIterator *ihash = BLI_ghashIterator_new((GHash *)graph.used_libraries);
while (!BLI_ghashIterator_done(ihash)) {
value = BLI_ghashIterator_getKey(ihash);
auto deps = gpu_shader_dependency_get_resolved_source((const char *)value);
info.dependencies_generated.extend_non_duplicates(deps);
BLI_ghashIterator_step(ihash);
}
BLI_ghashIterator_free(ihash);
}
void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
@ -526,6 +554,9 @@ void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
}
}
eval_ss << ");\n\n";
/* Increment heuristic. */
nodes_total_++;
}
char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link)
@ -589,6 +620,7 @@ void GPUCodegen::generate_uniform_buffer()
if (input->source == GPU_SOURCE_UNIFORM && !input->link) {
/* We handle the UBO uniforms separately. */
BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input));
uniforms_total_++;
}
}
}
@ -661,10 +693,17 @@ void GPUCodegen::generate_graphs()
GPUPass *GPU_generate_pass(GPUMaterial *material,
GPUNodeGraph *graph,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk)
void *thunk,
bool optimize_graph)
{
gpu_node_graph_prune_unused(graph);
/* If the optimize_graph flag is passed, we are generating an optimized
* variant of the GPUMaterial's GPUPass. */
if (optimize_graph) {
gpu_node_graph_optimize(graph);
}
/* Extract attributes before compiling so the generated VBOs are ready to accept the future
* shader. */
gpu_node_graph_finalize_uniform_attrs(graph);
@ -672,23 +711,33 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
GPUCodegen codegen(material, graph);
codegen.generate_graphs();
codegen.generate_cryptomatte();
codegen.generate_uniform_buffer();
/* Cache lookup: Reuse shaders already compiled. */
GPUPass *pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
GPUPass *pass_hash = nullptr;
/* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
* there is no way to have a collision currently. Some advocated to only use a bigger hash. */
if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
if (!gpu_pass_is_valid(pass_hash)) {
/* Shader has already been created but failed to compile. */
return nullptr;
if (!optimize_graph) {
/* The optimized version of the shader should not re-generate a UBO.
* The UBO will not be used for this variant. */
codegen.generate_uniform_buffer();
/** Cache lookup: Reuse shaders already compiled.
* NOTE: We only perform cache look-up for non-optimized shader
* graphs, as baked constant data amongst other optimizations will generate too many
* shader source permutations, with minimal re-usability. */
pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
/* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
* there is no way to have a collision currently. Some advocated to only use a bigger hash. */
if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
if (!gpu_pass_is_valid(pass_hash)) {
/* Shader has already been created but failed to compile. */
return nullptr;
}
/* No collision, just return the pass. */
BLI_spin_lock(&pass_cache_spin);
pass_hash->refcount += 1;
BLI_spin_unlock(&pass_cache_spin);
return pass_hash;
}
/* No collision, just return the pass. */
BLI_spin_lock(&pass_cache_spin);
pass_hash->refcount += 1;
BLI_spin_unlock(&pass_cache_spin);
return pass_hash;
}
/* Either the shader is not compiled or there is a hash collision...
@ -726,14 +775,33 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
pass->create_info = codegen.create_info;
pass->hash = codegen.hash_get();
pass->compiled = false;
/* Only flag the pass optimization hint if this is the first generated pass for a material.
* Optimized passes cannot be optimized further, even if the heuristic would still
* flag them as candidates. */
pass->should_optimize = (!optimize_graph) && codegen.should_optimize_heuristic();
codegen.create_info = nullptr;
gpu_pass_cache_insert_after(pass_hash, pass);
/* Only insert non-optimized graphs into the cache.
* Optimized graphs will continuously be recompiled with new unique source during material
* editing, which would quickly fill up the cache with materials offering minimal
* re-use. */
if (!optimize_graph) {
gpu_pass_cache_insert_after(pass_hash, pass);
}
}
return pass;
}
bool GPU_pass_should_optimize(GPUPass *pass)
{
/* Returns optimization heuristic prepared during
* initial codegen.
* NOTE: Optimization currently limited to Metal backend as repeated compilations required for
* material specialization cause impactful CPU stalls on OpenGL platforms. */
return (GPU_backend_get_type() == GPU_BACKEND_METAL) && pass->should_optimize;
}
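As a worked example of the heuristic gating above, a standalone restatement (hypothetical helper, not part of the patch) with the same thresholds as GPUCodegen::should_optimize_heuristic():

/* Sketch only: the complexity heuristic restated standalone for clarity. */
static bool example_should_optimize(unsigned int nodes, unsigned int textures, unsigned int uniforms)
{
  const bool exceeds_a_limit = (nodes >= 60 || textures >= 4 || uniforms >= 64);
  const bool meets_baseline = (textures >= 1 && uniforms >= 8 && nodes >= 4);
  /* e.g. (70, 2, 10) -> true (node count alone exceeds a limit and the baseline is met);
   * (10, 1, 8) -> false (no limit exceeded). */
  return exceeds_a_limit && meets_baseline;
}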
/** \} */
/* -------------------------------------------------------------------- */


@ -25,10 +25,12 @@ typedef struct GPUPass GPUPass;
GPUPass *GPU_generate_pass(GPUMaterial *material,
struct GPUNodeGraph *graph,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk);
void *thunk,
bool optimize_graph);
GPUShader *GPU_pass_shader_get(GPUPass *pass);
bool GPU_pass_compile(GPUPass *pass, const char *shname);
void GPU_pass_release(GPUPass *pass);
bool GPU_pass_should_optimize(GPUPass *pass);
/* Module */


@ -34,6 +34,8 @@
#include "DRW_engine.h"
#include "PIL_time.h"
#include "gpu_codegen.h"
#include "gpu_node_graph.h"
@ -43,6 +45,17 @@
#define MAX_COLOR_BAND 128
#define MAX_GPU_SKIES 8
/** Whether the optimized variant of the GPUPass should be created asynchronously.
* Usage of this depends on whether doing so introduces any threading challenges.
* Currently, the overhead of GPU_generate_pass is relatively small in comparison to shader
* compilation, though this option exists in case any scenario of material graph
* optimization causes a slowdown on the main thread.
*
* NOTE: The actual shader program for the optimized pass will always be compiled asynchronously,
* this flag controls whether shader node graph source serialization happens on the compilation
* worker thread as well. */
#define ASYNC_OPTIMIZED_PASS_CREATION 0
typedef struct GPUColorBandBuilder {
float pixels[MAX_COLOR_BAND][CM_TABLE + 1][4];
int current_layer;
@ -57,6 +70,27 @@ struct GPUMaterial {
/* Contains #GPUShader and source code for deferred compilation.
* Can be shared between similar material (i.e: sharing same node-tree topology). */
GPUPass *pass;
/* Optimized GPUPass, situationally compiled after the initial pass for optimal realtime
* performance. This shader variant bakes dynamic uniform data as constant. This variant will not
* use the UBO, instead baking constants directly into the shader source. */
GPUPass *optimized_pass;
/* Optimization status.
* We also use this status to determine whether this material should be considered for
* optimization. Only sufficiently complex shaders benefit from constant-folding optimizations.
* `GPU_MAT_OPTIMIZATION_READY` -> shader should be optimized and is ready for optimization.
* `GPU_MAT_OPTIMIZATION_SKIP` -> Shader should not be optimized as it would not benefit
* performance to do so, based on the heuristic.
*/
eGPUMaterialOptimizationStatus optimization_status;
double creation_time;
#if ASYNC_OPTIMIZED_PASS_CREATION == 1
struct DeferredOptimizePass {
GPUCodegenCallbackFn callback;
void *thunk;
} DeferredOptimizePass;
struct DeferredOptimizePass optimize_pass_info;
#endif
/** UBOs for this material parameters. */
GPUUniformBuf *ubo;
/** Compilation status. Do not use if shader is not GPU_MAT_SUCCESS. */
@ -86,6 +120,12 @@ struct GPUMaterial {
/* Low level node graph(s). Also contains resources needed by the material. */
GPUNodeGraph graph;
/** Default material reference used for PSO cache warming. Default materials may perform
* different operations, but the permutation will frequently share the same input PSO
* descriptors. This enables async PSO compilation as part of the deferred compilation
* pass, reducing runtime stuttering and improving responsiveness while compiling materials. */
GPUMaterial *default_mat;
/** DEPRECATED: To remove. */
bool has_surface_output;
bool has_volume_output;
@ -214,6 +254,9 @@ void GPU_material_free_single(GPUMaterial *material)
gpu_node_graph_free(&material->graph);
if (material->optimized_pass != NULL) {
GPU_pass_release(material->optimized_pass);
}
if (material->pass != NULL) {
GPU_pass_release(material->pass);
}
@ -252,12 +295,29 @@ Scene *GPU_material_scene(GPUMaterial *material)
GPUPass *GPU_material_get_pass(GPUMaterial *material)
{
return material->pass;
/* If an optimized pass variant is available, and optimization is
* flagged as complete, we use this one instead. */
return ((GPU_material_optimization_status(material) == GPU_MAT_OPTIMIZATION_SUCCESS) &&
material->optimized_pass) ?
material->optimized_pass :
material->pass;
}
GPUShader *GPU_material_get_shader(GPUMaterial *material)
{
return material->pass ? GPU_pass_shader_get(material->pass) : NULL;
/* If an optimized material shader variant is available, and optimization is
* flagged as complete, we use this one instead. */
GPUShader *shader = ((GPU_material_optimization_status(material) ==
GPU_MAT_OPTIMIZATION_SUCCESS) &&
material->optimized_pass) ?
GPU_pass_shader_get(material->optimized_pass) :
NULL;
return (shader) ? shader : ((material->pass) ? GPU_pass_shader_get(material->pass) : NULL);
}
GPUShader *GPU_material_get_shader_base(GPUMaterial *material)
{
return (material->pass) ? GPU_pass_shader_get(material->pass) : NULL;
}
const char *GPU_material_get_name(GPUMaterial *material)
@ -665,6 +725,41 @@ void GPU_material_status_set(GPUMaterial *mat, eGPUMaterialStatus status)
mat->status = status;
}
eGPUMaterialOptimizationStatus GPU_material_optimization_status(GPUMaterial *mat)
{
return mat->optimization_status;
}
void GPU_material_optimization_status_set(GPUMaterial *mat, eGPUMaterialOptimizationStatus status)
{
mat->optimization_status = status;
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_READY) {
/* Reset creation timer to delay optimization pass. */
mat->creation_time = PIL_check_seconds_timer();
}
}
bool GPU_material_optimization_ready(GPUMaterial *mat)
{
/* Timer threshold before optimizations will be queued.
* When materials are frequently being modified, optimization
* can incur CPU overhead from excessive compilation.
*
* As the optimization is entirely asynchronous, it is still beneficial
* to do this quickly to avoid build-up and improve runtime performance.
* The threshold just prevents compilations being queued frame after frame. */
const double optimization_time_threshold_s = 1.2;
return ((PIL_check_seconds_timer() - mat->creation_time) >= optimization_time_threshold_s);
}
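For reference, a hedged sketch summarizing the optimization-status lifecycle these functions drive (hypothetical debug helper, not part of the patch; it only covers the states that appear in this diff):

/* Sketch only: map the optimization states introduced in this patch to their meaning. */
static const char *example_optimization_state_name(eGPUMaterialOptimizationStatus status)
{
  switch (status) {
    case GPU_MAT_OPTIMIZATION_SKIP:
      /* Heuristic not met, or optimization failed; the base pass is final. */
      return "skip";
    case GPU_MAT_OPTIMIZATION_READY:
      /* Heuristic met at pass generation; waiting for the delay timer before queuing. */
      return "ready";
    case GPU_MAT_OPTIMIZATION_QUEUED:
      /* In the deferred optimize_queue, awaiting GPU_material_optimize(). */
      return "queued";
    case GPU_MAT_OPTIMIZATION_SUCCESS:
      /* Optimized pass compiled; GPU_material_get_pass()/GPU_material_get_shader() prefer it. */
      return "success";
  }
  return "unknown";
}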
void GPU_material_set_default(GPUMaterial *material, GPUMaterial *default_material)
{
BLI_assert(material != default_material);
if (material != default_material) {
material->default_mat = default_material;
}
}
/* Code generation */
bool GPU_material_has_surface_output(GPUMaterial *mat)
@ -730,6 +825,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
mat->uuid = shader_uuid;
mat->flag = GPU_MATFLAG_UPDATED;
mat->status = GPU_MAT_CREATED;
mat->default_mat = NULL;
mat->is_volume_shader = is_volume_shader;
mat->graph.used_libraries = BLI_gset_new(
BLI_ghashutil_ptrhash, BLI_ghashutil_ptrcmp, "GPUNodeGraph.used_libraries");
@ -748,7 +844,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
{
/* Create source code and search pass cache for an already compiled version. */
mat->pass = GPU_generate_pass(mat, &mat->graph, callback, thunk);
mat->pass = GPU_generate_pass(mat, &mat->graph, callback, thunk, false);
if (mat->pass == NULL) {
/* We had a cache hit and the shader has already failed to compile. */
@ -756,11 +852,44 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
gpu_node_graph_free(&mat->graph);
}
else {
/* Determine whether we should generate an optimized variant of the graph.
* Heuristic is based on complexity of default material pass and shader node graph. */
if (GPU_pass_should_optimize(mat->pass)) {
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_READY);
}
GPUShader *sh = GPU_pass_shader_get(mat->pass);
if (sh != NULL) {
/* We had a cache hit and the shader is already compiled. */
mat->status = GPU_MAT_SUCCESS;
gpu_node_graph_free_nodes(&mat->graph);
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_SKIP) {
gpu_node_graph_free_nodes(&mat->graph);
}
}
/* Generate optimized pass. */
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_READY) {
#if ASYNC_OPTIMIZED_PASS_CREATION == 1
mat->optimized_pass = NULL;
mat->optimize_pass_info.callback = callback;
mat->optimize_pass_info.thunk = thunk;
#else
mat->optimized_pass = GPU_generate_pass(mat, &mat->graph, callback, thunk, true);
if (mat->optimized_pass == NULL) {
/* Failed to create optimized pass. */
gpu_node_graph_free_nodes(&mat->graph);
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
}
else {
GPUShader *optimized_sh = GPU_pass_shader_get(mat->optimized_pass);
if (optimized_sh != NULL) {
/* Optimized shader already available. */
gpu_node_graph_free_nodes(&mat->graph);
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SUCCESS);
}
}
#endif
}
}
}
@ -810,8 +939,37 @@ void GPU_material_compile(GPUMaterial *mat)
if (success) {
GPUShader *sh = GPU_pass_shader_get(mat->pass);
if (sh != NULL) {
/** Perform async Render Pipeline State Object (PSO) compilation.
*
* Warm PSO cache within async compilation thread using default material as source.
* GPU_shader_warm_cache(..) performs the API-specific PSO compilation using the assigned
* parent shader's cached PSO descriptors as an input.
*
* This is only applied if the given material has a specified default reference
* material available, and the default material is already compiled.
*
* As PSOs do not always match for default shaders, we limit warming to a small number of PSO
* configurations to ensure compile time remains fast, as these first
* entries will be the most commonly used PSOs. As not all PSOs are necessarily
* required immediately, this limit should remain low (1-3 at most). */
if (mat->default_mat != NULL && mat->default_mat != mat) {
if (mat->default_mat->pass != NULL) {
GPUShader *parent_sh = GPU_pass_shader_get(mat->default_mat->pass);
if (parent_sh) {
GPU_shader_set_parent(sh, parent_sh);
GPU_shader_warm_cache(sh, 1);
}
}
}
/* Flag success. */
mat->status = GPU_MAT_SUCCESS;
gpu_node_graph_free_nodes(&mat->graph);
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_SKIP) {
/* Only free node graph nodes if not required by secondary optimization pass. */
gpu_node_graph_free_nodes(&mat->graph);
}
}
else {
mat->status = GPU_MAT_FAILED;
@ -825,6 +983,89 @@ void GPU_material_compile(GPUMaterial *mat)
}
}
void GPU_material_optimize(GPUMaterial *mat)
{
/* If shader is flagged for skipping optimization or has already been successfully
* optimized, skip. */
if (ELEM(mat->optimization_status, GPU_MAT_OPTIMIZATION_SKIP, GPU_MAT_OPTIMIZATION_SUCCESS)) {
return;
}
/* If original shader has not been fully compiled, we are not
* ready to perform optimization. */
if (mat->status != GPU_MAT_SUCCESS) {
/* Reset optimization status. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_READY);
return;
}
#if ASYNC_OPTIMIZED_PASS_CREATION == 1
/* If the optimized pass is not valid, first generate optimized pass.
* NOTE(Threading): Need to verify if GPU_generate_pass can cause side-effects, especially when
* used with "thunk". So far, this appears to work, and deferring optimized pass creation is more
* optimal, as these do not benefit from caching due to baked constants. However, this could
* be a cause for concern in certain cases. */
if (!mat->optimized_pass) {
mat->optimized_pass = GPU_generate_pass(
mat, &mat->graph, mat->optimize_pass_info.callback, mat->optimize_pass_info.thunk, true);
BLI_assert(mat->optimized_pass);
}
#else
if (!mat->optimized_pass) {
/* Optimized pass has not been created, skip future optimization attempts. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
return;
}
#endif
bool success;
/* NOTE: The shader may have already been compiled here since we are
* sharing GPUShader across GPUMaterials. In this case it's a no-op. */
#ifndef NDEBUG
success = GPU_pass_compile(mat->optimized_pass, mat->name);
#else
success = GPU_pass_compile(mat->optimized_pass, __func__);
#endif
if (success) {
GPUShader *sh = GPU_pass_shader_get(mat->optimized_pass);
if (sh != NULL) {
/** Perform async Render Pipeline State Object (PSO) compilation.
*
* Warm PSO cache within async compilation thread for optimized materials.
* This setup assigns the original unoptimized shader as a "parent" shader
* for the optimized version. This then allows the associated GPU backend to
* compile PSOs within this asynchronous pass, using the identical PSO descriptors of the
* parent shader.
*
* This eliminates all run-time stuttering associated with material optimization and ensures
* realtime material editing and animation remain seamless, while retaining optimal realtime
* performance. */
GPUShader *parent_sh = GPU_pass_shader_get(mat->pass);
if (parent_sh) {
GPU_shader_set_parent(sh, parent_sh);
GPU_shader_warm_cache(sh, -1);
}
/* Mark as complete. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SUCCESS);
}
else {
/* Optimized pass failed to compile. Disable any future optimization attempts. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
}
}
else {
/* Optimization pass generation failed. Disable future attempts to optimize. */
GPU_pass_release(mat->optimized_pass);
mat->optimized_pass = NULL;
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
}
/* Release node graph as no longer needed. */
gpu_node_graph_free_nodes(&mat->graph);
}
void GPU_materials_free(Main *bmain)
{
LISTBASE_FOREACH (Material *, ma, &bmain->materials) {
@ -847,6 +1088,9 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
material->graph.used_libraries = BLI_gset_new(
BLI_ghashutil_ptrhash, BLI_ghashutil_ptrcmp, "GPUNodeGraph.used_libraries");
material->refcount = 1;
material->optimization_status = GPU_MAT_OPTIMIZATION_SKIP;
material->optimized_pass = NULL;
material->default_mat = NULL;
/* Construct the material graph by adding and linking the necessary GPU material nodes. */
construct_function_cb(thunk, material);
@ -855,7 +1099,9 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
gpu_material_ramp_texture_build(material);
/* Lookup an existing pass in the cache or generate a new one. */
material->pass = GPU_generate_pass(material, &material->graph, generate_code_function_cb, thunk);
material->pass = GPU_generate_pass(
material, &material->graph, generate_code_function_cb, thunk, false);
material->optimized_pass = NULL;
/* The pass already exists in the pass cache but its shader already failed to compile. */
if (material->pass == NULL) {
@ -868,7 +1114,10 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
GPUShader *shader = GPU_pass_shader_get(material->pass);
if (shader != NULL) {
material->status = GPU_MAT_SUCCESS;
gpu_node_graph_free_nodes(&material->graph);
if (material->optimization_status == GPU_MAT_OPTIMIZATION_SKIP) {
/* Only free node graph if not required by secondary optimization pass. */
gpu_node_graph_free_nodes(&material->graph);
}
return material;
}


@ -983,3 +983,22 @@ void gpu_node_graph_prune_unused(GPUNodeGraph *graph)
}
}
}
void gpu_node_graph_optimize(GPUNodeGraph *graph)
{
/* Replace all uniform node links with constant. */
LISTBASE_FOREACH (GPUNode *, node, &graph->nodes) {
LISTBASE_FOREACH (GPUInput *, input, &node->inputs) {
if (input->link) {
if (input->link->link_type == GPU_NODE_LINK_UNIFORM) {
input->link->link_type = GPU_NODE_LINK_CONSTANT;
}
}
if (input->source == GPU_SOURCE_UNIFORM) {
input->source = (input->type == GPU_CLOSURE) ? GPU_SOURCE_STRUCT : GPU_SOURCE_CONSTANT;
}
}
}
/* TODO: Consider performing other node graph optimizations here. */
}


@ -190,6 +190,19 @@ void gpu_nodes_tag(GPUNodeLink *link, eGPUNodeTag tag);
void gpu_node_graph_prune_unused(GPUNodeGraph *graph);
void gpu_node_graph_finalize_uniform_attrs(GPUNodeGraph *graph);
/**
* Optimize node graph for optimized material shader path.
* Once the base material has been generated, we can modify the shader
* node graph to create one which will produce an optimally performing shader.
* This currently involves baking uniform data into constant data to enable
* aggressive constant folding by the compiler in order to reduce complexity and
* shader core memory pressure.
*
* NOTE: Graph optimizations will produce a shader which needs to be re-compiled
* more frequently, however, the default material pass will always exist to fall
* back on. */
void gpu_node_graph_optimize(GPUNodeGraph *graph);
/**
* Free intermediate node graph.
*/


@ -500,6 +500,26 @@ const char *GPU_shader_get_name(GPUShader *shader)
return unwrap(shader)->name_get();
}
/* -------------------------------------------------------------------- */
/** \name Shader cache warming
* \{ */
void GPU_shader_set_parent(GPUShader *shader, GPUShader *parent)
{
BLI_assert(shader != nullptr);
BLI_assert(shader != parent);
if (shader != parent) {
Shader *shd_child = unwrap(shader);
Shader *shd_parent = unwrap(parent);
shd_child->parent_set(shd_parent);
}
}
void GPU_shader_warm_cache(GPUShader *shader, int limit)
{
unwrap(shader)->warm_cache(limit);
}
/** \} */
/* -------------------------------------------------------------------- */


@ -34,6 +34,12 @@ class Shader {
/** For debugging purpose. */
char name[64];
/* Parent shader can be used for shaders which are derived from the same source material.
* The child shader can pull information from its parent to prepare additional resources
* such as PSOs upfront. This enables asynchronous PSO compilation which mitigates stuttering
* when updating new materials. */
Shader *parent_shader_ = nullptr;
public:
Shader(const char *name);
virtual ~Shader();
@ -43,6 +49,11 @@ class Shader {
virtual void fragment_shader_from_glsl(MutableSpan<const char *> sources) = 0;
virtual void compute_shader_from_glsl(MutableSpan<const char *> sources) = 0;
virtual bool finalize(const shader::ShaderCreateInfo *info = nullptr) = 0;
/* Pre-warms PSOs using parent shader's cached PSO descriptors. Limit specifies maximum PSOs to
* warm. If the limit is <= 0, compiles all PSO permutations in the parent shader.
*
* See `GPU_shader_warm_cache(..)` in `GPU_shader.h` for more information. */
virtual void warm_cache(int limit) = 0;
virtual void transform_feedback_names_set(Span<const char *> name_list,
eGPUShaderTFBType geom_type) = 0;
@ -69,7 +80,17 @@ class Shader {
inline const char *const name_get() const
{
return name;
};
}
inline void parent_set(Shader *parent)
{
parent_shader_ = parent;
}
inline Shader *parent_get() const
{
return parent_shader_;
}
static bool srgb_uniform_dirty_get();
static void set_srgb_uniform(GPUShader *shader);


@ -31,6 +31,14 @@ struct MTLVertexAttributeDescriptorPSO {
return uint64_t((uint64_t(this->format) ^ (this->offset << 4) ^ (this->buffer_index << 8) ^
(this->format_conversion_mode << 12)));
}
void reset()
{
format = MTLVertexFormatInvalid;
offset = 0;
buffer_index = 0;
format_conversion_mode = GPU_FETCH_FLOAT;
}
};
struct MTLVertexBufferLayoutDescriptorPSO {
@ -48,6 +56,13 @@ struct MTLVertexBufferLayoutDescriptorPSO {
{
return uint64_t(uint64_t(this->step_function) ^ (this->step_rate << 4) ^ (this->stride << 8));
}
void reset()
{
step_function = MTLVertexStepFunctionPerVertex;
step_rate = 1;
stride = 0;
}
};
/* SSBO attribute state caching. */
@ -76,6 +91,16 @@ struct MTLSSBOAttribute {
{
return (memcmp(this, &other, sizeof(MTLSSBOAttribute)) == 0);
}
void reset()
{
mtl_attribute_index = 0;
vbo_id = 0;
attribute_offset = 0;
per_vertex_stride = 0;
attribute_format = 0;
is_instance = false;
}
};
struct MTLVertexDescriptor {
@ -241,10 +266,10 @@ struct MTLRenderPipelineStateDescriptor {
hash ^= uint64_t(this->dest_rgb_blend_factor) << 37; /* Up to 18 (5 bits). */
hash ^= uint64_t(this->src_alpha_blend_factor) << 42; /* Up to 18 (5 bits). */
hash ^= uint64_t(this->src_rgb_blend_factor) << 47; /* Up to 18 (5 bits). */
}
for (const uint c : IndexRange(GPU_FB_MAX_COLOR_ATTACHMENT)) {
hash ^= uint64_t(this->color_attachment_format[c]) << (c + 52); /* Up to 555 (9 bits). */
for (const uint c : IndexRange(GPU_FB_MAX_COLOR_ATTACHMENT)) {
hash ^= uint64_t(this->color_attachment_format[c]) << (c + 52); /* Up to 555 (9 bits). */
}
}
hash |= uint64_t((this->blending_enabled && (this->num_color_attachments > 0)) ? 1 : 0) << 62;
@ -262,9 +287,9 @@ struct MTLRenderPipelineStateDescriptor {
vertex_descriptor.total_attributes = 0;
vertex_descriptor.max_attribute_value = 0;
vertex_descriptor.num_vert_buffers = 0;
vertex_descriptor.prim_topology_class = MTLPrimitiveTopologyClassUnspecified;
for (int i = 0; i < GPU_VERT_ATTR_MAX_LEN; i++) {
vertex_descriptor.attributes[i].format = MTLVertexFormatInvalid;
vertex_descriptor.attributes[i].offset = 0;
vertex_descriptor.attributes[i].reset();
}
vertex_descriptor.uses_ssbo_vertex_fetch = false;
vertex_descriptor.num_ssbo_attributes = 0;


@ -76,6 +76,8 @@ struct MTLRenderPipelineStateInstance {
int null_attribute_buffer_index;
/* buffer bind used for transform feedback output buffer. */
int transform_feedback_buffer_index;
/* Topology class. */
MTLPrimitiveTopologyClass prim_type;
/** Reflection Data.
* Currently used to verify whether uniform buffers of incorrect sizes being bound, due to left
@ -188,6 +190,7 @@ class MTLShader : public Shader {
MTLRenderPipelineStateDescriptor current_pipeline_state_;
/* Cache of compiled PipelineStateObjects. */
blender::Map<MTLRenderPipelineStateDescriptor, MTLRenderPipelineStateInstance *> pso_cache_;
std::mutex pso_cache_lock_;
/** Compute pipeline state and Compute PSO caching. */
MTLComputePipelineStateInstance compute_pso_instance_;
@ -256,6 +259,7 @@ class MTLShader : public Shader {
/* Compile and build - Return true if successful. */
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
bool finalize_compute(const shader::ShaderCreateInfo *info);
void warm_cache(int limit) override;
/* Utility. */
bool is_valid()
@ -331,8 +335,14 @@ class MTLShader : public Shader {
void shader_source_from_msl(NSString *input_vertex_source, NSString *input_fragment_source);
void shader_compute_source_from_msl(NSString *input_compute_source);
void set_interface(MTLShaderInterface *interface);
MTLRenderPipelineStateInstance *bake_current_pipeline_state(MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type);
MTLRenderPipelineStateInstance *bake_pipeline_state(
MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type,
const MTLRenderPipelineStateDescriptor &pipeline_descriptor);
bool bake_compute_pipeline_state(MTLContext *ctx);
const MTLComputePipelineStateInstance &get_compute_pipeline_state();


@ -6,6 +6,8 @@
#include "BKE_global.h"
#include "PIL_time.h"
#include "BLI_string.h"
#include <algorithm>
#include <fstream>
@ -110,6 +112,7 @@ MTLShader::~MTLShader()
}
/* Free Pipeline Cache. */
pso_cache_lock_.lock();
for (const MTLRenderPipelineStateInstance *pso_inst : pso_cache_.values()) {
if (pso_inst->vert) {
[pso_inst->vert release];
@ -123,6 +126,7 @@ MTLShader::~MTLShader()
delete pso_inst;
}
pso_cache_.clear();
pso_cache_lock_.unlock();
/* Free Compute pipeline state object. */
if (compute_pso_instance_.compute) {
@ -616,6 +620,36 @@ void MTLShader::push_constant_bindstate_mark_dirty(bool is_dirty)
push_constant_modified_ = is_dirty;
}
void MTLShader::warm_cache(int limit)
{
if (parent_shader_ != nullptr) {
MTLContext *ctx = MTLContext::get();
MTLShader *parent_mtl = reinterpret_cast<MTLShader *>(parent_shader_);
/* Extract PSO descriptors from parent shader. */
blender::Vector<MTLRenderPipelineStateDescriptor> descriptors;
blender::Vector<MTLPrimitiveTopologyClass> prim_classes;
parent_mtl->pso_cache_lock_.lock();
for (const auto &pso_entry : parent_mtl->pso_cache_.items()) {
const MTLRenderPipelineStateDescriptor &pso_descriptor = pso_entry.key;
const MTLRenderPipelineStateInstance *pso_inst = pso_entry.value;
descriptors.append(pso_descriptor);
prim_classes.append(pso_inst->prim_type);
}
parent_mtl->pso_cache_lock_.unlock();
/* Warm shader cache with applied limit.
* If limit is <= 0, compile all PSO permutations. */
limit = (limit > 0) ? limit : descriptors.size();
for (int i : IndexRange(min_ii(descriptors.size(), limit))) {
const MTLRenderPipelineStateDescriptor &pso_descriptor = descriptors[i];
const MTLPrimitiveTopologyClass &prim_class = prim_classes[i];
bake_pipeline_state(ctx, prim_class, pso_descriptor);
}
}
}
/** \} */
/* -------------------------------------------------------------------- */
@ -681,12 +715,10 @@ void MTLShader::set_interface(MTLShaderInterface *interface)
MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
MTLContext *ctx, MTLPrimitiveTopologyClass prim_type)
{
/** Populate global pipeline descriptor and use this to prepare new PSO. */
/* NOTE(Metal): PSO cache can be accessed from multiple threads, though these operations should
* be thread-safe due to organization of high-level renderer. If there are any issues, then
* access can be guarded as appropriate. */
BLI_assert(this);
MTLShaderInterface *mtl_interface = this->get_interface();
BLI_assert(mtl_interface);
BLI_assert(this->is_valid());
/* NOTE(Metal): Vertex input assembly description will have been populated externally
@ -756,15 +788,32 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
pipeline_descriptor.vertex_descriptor.prim_topology_class =
(requires_specific_topology_class) ? prim_type : MTLPrimitiveTopologyClassUnspecified;
/* Bake pipeline state using global descriptor. */
return bake_pipeline_state(ctx, prim_type, pipeline_descriptor);
}
/* Variant which bakes a pipeline state based on an existing MTLRenderPipelineStateDescriptor.
* This function should be callable from a secondary compilation thread. */
MTLRenderPipelineStateInstance *MTLShader::bake_pipeline_state(
MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type,
const MTLRenderPipelineStateDescriptor &pipeline_descriptor)
{
/* Fetch shader interface. */
MTLShaderInterface *mtl_interface = this->get_interface();
BLI_assert(mtl_interface);
BLI_assert(this->is_valid());
/* Check if current PSO exists in the cache. */
pso_cache_lock_.lock();
MTLRenderPipelineStateInstance **pso_lookup = pso_cache_.lookup_ptr(pipeline_descriptor);
MTLRenderPipelineStateInstance *pipeline_state = (pso_lookup) ? *pso_lookup : nullptr;
pso_cache_lock_.unlock();
if (pipeline_state != nullptr) {
return pipeline_state;
}
shader_debug_printf("Baking new pipeline variant for shader: %s\n", this->name);
/* Generate new Render Pipeline State Object (PSO). */
@autoreleasepool {
/* Prepare Render Pipeline Descriptor. */
@ -774,7 +823,6 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
MTLFunctionConstantValues *values = [[MTLFunctionConstantValues new] autorelease];
/* Prepare Vertex descriptor based on current pipeline vertex binding state. */
MTLRenderPipelineStateDescriptor &current_state = pipeline_descriptor;
MTLRenderPipelineDescriptor *desc = pso_descriptor_;
[desc reset];
pso_descriptor_.label = [NSString stringWithUTF8String:this->name];
@ -784,7 +832,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
* specialization constant, customized per unique pipeline state permutation.
*
* NOTE: For binding point compaction, we could use the number of VBOs present
* in the current PSO configuration `current_state.vertex_descriptor.num_vert_buffers`).
* in the current PSO configuration `pipeline_descriptor.vertex_descriptor.num_vert_buffers`).
* However, it is more efficient to simply offset the uniform buffer base index to the
* maximal number of VBO bind-points, as then UBO bind-points for similar draw calls
* will align and avoid the requirement for additional binding. */
@ -792,7 +840,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
/* Null buffer index is used if an attribute is not found in the
* bound VBOs #VertexFormat. */
int null_buffer_index = current_state.vertex_descriptor.num_vert_buffers;
int null_buffer_index = pipeline_descriptor.vertex_descriptor.num_vert_buffers;
bool using_null_buffer = false;
if (this->get_uses_ssbo_vertex_fetch()) {
@ -806,11 +854,12 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
MTL_uniform_buffer_base_index = MTL_SSBO_VERTEX_FETCH_IBO_INDEX + 1;
}
else {
for (const uint i : IndexRange(current_state.vertex_descriptor.max_attribute_value + 1)) {
for (const uint i :
IndexRange(pipeline_descriptor.vertex_descriptor.max_attribute_value + 1)) {
/* Metal back-end attribute descriptor state. */
MTLVertexAttributeDescriptorPSO &attribute_desc =
current_state.vertex_descriptor.attributes[i];
const MTLVertexAttributeDescriptorPSO &attribute_desc =
pipeline_descriptor.vertex_descriptor.attributes[i];
/* Flag format conversion */
/* In some cases, Metal cannot implicitly convert between data types.
@ -860,10 +909,10 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
mtl_attribute.bufferIndex = attribute_desc.buffer_index;
}
for (const uint i : IndexRange(current_state.vertex_descriptor.num_vert_buffers)) {
for (const uint i : IndexRange(pipeline_descriptor.vertex_descriptor.num_vert_buffers)) {
/* Metal back-end state buffer layout. */
const MTLVertexBufferLayoutDescriptorPSO &buf_layout =
current_state.vertex_descriptor.buffer_layouts[i];
pipeline_descriptor.vertex_descriptor.buffer_layouts[i];
/* Copy metal back-end buffer layout state into PSO descriptor.
* NOTE: need to copy each element due to copying from internal
* back-end descriptor to Metal API descriptor. */
@ -875,7 +924,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
}
/* Mark empty attribute conversion. */
for (int i = current_state.vertex_descriptor.max_attribute_value + 1;
for (int i = pipeline_descriptor.vertex_descriptor.max_attribute_value + 1;
i < GPU_VERT_ATTR_MAX_LEN;
i++) {
int MTL_attribute_conversion_mode = 0;
@ -1039,7 +1088,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
for (int color_attachment = 0; color_attachment < GPU_FB_MAX_COLOR_ATTACHMENT;
color_attachment++) {
/* Fetch color attachment pixel format in back-end pipeline state. */
MTLPixelFormat pixel_format = current_state.color_attachment_format[color_attachment];
MTLPixelFormat pixel_format = pipeline_descriptor.color_attachment_format[color_attachment];
/* Populate MTL API PSO attachment descriptor. */
MTLRenderPipelineColorAttachmentDescriptor *col_attachment =
desc.colorAttachments[color_attachment];
@ -1048,19 +1097,19 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
if (pixel_format != MTLPixelFormatInvalid) {
bool format_supports_blending = mtl_format_supports_blending(pixel_format);
col_attachment.writeMask = current_state.color_write_mask;
col_attachment.blendingEnabled = current_state.blending_enabled &&
col_attachment.writeMask = pipeline_descriptor.color_write_mask;
col_attachment.blendingEnabled = pipeline_descriptor.blending_enabled &&
format_supports_blending;
if (format_supports_blending && current_state.blending_enabled) {
col_attachment.alphaBlendOperation = current_state.alpha_blend_op;
col_attachment.rgbBlendOperation = current_state.rgb_blend_op;
col_attachment.destinationAlphaBlendFactor = current_state.dest_alpha_blend_factor;
col_attachment.destinationRGBBlendFactor = current_state.dest_rgb_blend_factor;
col_attachment.sourceAlphaBlendFactor = current_state.src_alpha_blend_factor;
col_attachment.sourceRGBBlendFactor = current_state.src_rgb_blend_factor;
if (format_supports_blending && pipeline_descriptor.blending_enabled) {
col_attachment.alphaBlendOperation = pipeline_descriptor.alpha_blend_op;
col_attachment.rgbBlendOperation = pipeline_descriptor.rgb_blend_op;
col_attachment.destinationAlphaBlendFactor = pipeline_descriptor.dest_alpha_blend_factor;
col_attachment.destinationRGBBlendFactor = pipeline_descriptor.dest_rgb_blend_factor;
col_attachment.sourceAlphaBlendFactor = pipeline_descriptor.src_alpha_blend_factor;
col_attachment.sourceRGBBlendFactor = pipeline_descriptor.src_rgb_blend_factor;
}
else {
if (current_state.blending_enabled && !format_supports_blending) {
if (pipeline_descriptor.blending_enabled && !format_supports_blending) {
shader_debug_printf(
"[Warning] Attempting to Bake PSO, but MTLPixelFormat %d does not support "
"blending\n",
@ -1069,8 +1118,8 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
}
}
}
desc.depthAttachmentPixelFormat = current_state.depth_attachment_format;
desc.stencilAttachmentPixelFormat = current_state.stencil_attachment_format;
desc.depthAttachmentPixelFormat = pipeline_descriptor.depth_attachment_format;
desc.stencilAttachmentPixelFormat = pipeline_descriptor.stencil_attachment_format;
/* Compile PSO */
MTLAutoreleasedRenderPipelineReflection reflection_data;
@ -1090,7 +1139,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
return nullptr;
}
else {
#ifndef NDEBUG
#if 0
NSLog(@"Successfully compiled PSO for shader: %s (Metal Context: %p)\n", this->name, ctx);
#endif
}
@ -1103,7 +1152,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
pso_inst->base_uniform_buffer_index = MTL_uniform_buffer_base_index;
pso_inst->null_attribute_buffer_index = (using_null_buffer) ? null_buffer_index : -1;
pso_inst->transform_feedback_buffer_index = MTL_transform_feedback_buffer_index;
pso_inst->shader_pso_index = pso_cache_.size();
pso_inst->prim_type = prim_type;
pso_inst->reflection_data_available = (reflection_data != nil);
if (reflection_data != nil) {
@ -1189,9 +1238,14 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
[pso_inst->pso retain];
/* Insert into pso cache. */
pso_cache_lock_.lock();
pso_inst->shader_pso_index = pso_cache_.size();
pso_cache_.add(pipeline_descriptor, pso_inst);
shader_debug_printf("PSO CACHE: Stored new variant in PSO cache for shader '%s'\n",
this->name);
pso_cache_lock_.unlock();
shader_debug_printf(
"PSO CACHE: Stored new variant in PSO cache for shader '%s' Hash: '%llu'\n",
this->name,
pipeline_descriptor.hash());
return pso_inst;
}
}
@ -1256,7 +1310,7 @@ bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx)
return false;
}
else {
#ifndef NDEBUG
#if 0
NSLog(@"Successfully compiled compute PSO for shader: %s (Metal Context: %p)\n",
this->name,
ctx);


@ -47,6 +47,7 @@ class GLShader : public Shader {
void fragment_shader_from_glsl(MutableSpan<const char *> sources) override;
void compute_shader_from_glsl(MutableSpan<const char *> sources) override;
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
void warm_cache(int limit) override{};
std::string resources_declare(const shader::ShaderCreateInfo &info) const override;
std::string vertex_interface_declare(const shader::ShaderCreateInfo &info) const override;


@ -35,6 +35,7 @@ class VKShader : public Shader {
void fragment_shader_from_glsl(MutableSpan<const char *> sources) override;
void compute_shader_from_glsl(MutableSpan<const char *> sources) override;
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
void warm_cache(int limit) override{};
void transform_feedback_names_set(Span<const char *> name_list,
eGPUShaderTFBType geom_type) override;