BMesh: Optimize copying attributes from many elements at once #115824

Merged
Hans Goudey merged 9 commits from HooglyBoogly/blender:bmesh-cd-copy-performance-fix into main 2023-12-09 05:37:47 +01:00
31 changed files with 543 additions and 164 deletions
Showing only changes of commit ed3170c1ba

View File

@ -674,7 +674,6 @@ if(NOT OpenImageIO_FOUND)
set(OPENIMAGEIO_LIBRARIES ${OIIO_OPTIMIZED} ${OIIO_DEBUG})
set(OPENIMAGEIO_IDIFF "${OPENIMAGEIO}/bin/idiff.exe")
endif()
add_definitions(-DOIIO_NO_SSE=1)
if(WITH_LLVM)
set(LLVM_ROOT_DIR ${LIBDIR}/llvm CACHE PATH "Path to the LLVM installation")

View File

@ -37,7 +37,7 @@ void SplitOperation::execute_pixel_sampled(float output[4],
{
int perc = x_split_ ? split_percentage_ * this->get_width() / 100.0f :
split_percentage_ * this->get_height() / 100.0f;
bool image1 = x_split_ ? x > perc : y > perc;
bool image1 = x_split_ ? x >= perc : y >= perc;
if (image1) {
image1Input_->read_sampled(output, x, y, PixelSampler::Nearest);
}
@ -64,7 +64,7 @@ void SplitOperation::update_memory_buffer_partial(MemoryBuffer *output,
split_percentage_ * this->get_height() / 100.0f;
const size_t elem_bytes = COM_data_type_bytes_len(get_output_socket()->get_data_type());
for (BuffersIterator<float> it = output->iterate_with(inputs, area); !it.is_end(); ++it) {
const bool is_image1 = x_split_ ? it.x > percent : it.y > percent;
const bool is_image1 = x_split_ ? it.x >= percent : it.y >= percent;
memcpy(it.out, it.in(is_image1 ? 0 : 1), elem_bytes);
}
}
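
The `>` to `>=` change above (mirrored in the GLSL split shader further down) moves the pixel sitting exactly on the split boundary onto the first input. A small standalone sketch of the boundary behavior, in plain C++ with made-up values rather than compositor code:

// Sketch only: which input the boundary pixel takes before and after the change.
#include <cstdio>

int main()
{
  const int width = 100;
  const float split_percentage = 0.0f;   /* Hypothetical: split pushed fully to one side. */
  const int perc = int(split_percentage * width / 100.0f);
  const int x = perc;                    /* Pixel exactly on the split boundary. */
  const bool old_is_image1 = x > perc;   /* false: boundary pixel samples the second input. */
  const bool new_is_image1 = x >= perc;  /* true: boundary pixel samples the first input. */
  std::printf("old=%d new=%d\n", old_is_image1, new_is_image1);
  return 0;
}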

View File

@ -41,7 +41,8 @@ using namespace nodes::derived_node_tree_types;
ShaderOperation::ShaderOperation(Context &context, ShaderCompileUnit &compile_unit)
: Operation(context), compile_unit_(compile_unit)
{
material_ = GPU_material_from_callbacks(&construct_material, &generate_code, this);
material_ = GPU_material_from_callbacks(
GPU_MAT_COMPOSITOR, &construct_material, &generate_code, this);
GPU_material_status_set(material_, GPU_MAT_QUEUED);
GPU_material_compile(material_);
}

View File

@ -9,9 +9,9 @@ void main()
ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
ivec2 output_size = imageSize(output_img);
#if defined(SPLIT_HORIZONTAL)
bool condition = (output_size.x * split_ratio) < texel.x;
bool condition = (output_size.x * split_ratio) <= texel.x;
#elif defined(SPLIT_VERTICAL)
bool condition = (output_size.y * split_ratio) < texel.y;
bool condition = (output_size.y * split_ratio) <= texel.y;
#endif
vec4 color = condition ? texture_load(first_image_tx, texel) :
texture_load(second_image_tx, texel);

View File

@ -477,6 +477,10 @@ set(GLSL_SRC
engines/eevee_next/shaders/eevee_deferred_capture_frag.glsl
engines/eevee_next/shaders/eevee_deferred_combine_frag.glsl
engines/eevee_next/shaders/eevee_deferred_planar_frag.glsl
engines/eevee_next/shaders/eevee_deferred_tile_classify_frag.glsl
engines/eevee_next/shaders/eevee_deferred_tile_compact_vert.glsl
engines/eevee_next/shaders/eevee_deferred_tile_stencil_frag.glsl
engines/eevee_next/shaders/eevee_deferred_tile_stencil_vert.glsl
engines/eevee_next/shaders/eevee_depth_of_field_accumulator_lib.glsl
engines/eevee_next/shaders/eevee_depth_of_field_bokeh_lut_comp.glsl
engines/eevee_next/shaders/eevee_depth_of_field_downsample_comp.glsl

View File

@ -1384,11 +1384,13 @@ static GPUMaterial *eevee_material_get_ex(
if (ma) {
bNodeTree *ntree = !is_default ? ma->nodetree : EEVEE_shader_default_surface_nodetree(ma);
mat = DRW_shader_from_material(ma, ntree, options, is_volume, deferred, cbfn, nullptr);
mat = DRW_shader_from_material(
ma, ntree, GPU_MAT_EEVEE_LEGACY, options, is_volume, deferred, cbfn, nullptr);
}
else {
bNodeTree *ntree = !is_default ? wo->nodetree : EEVEE_shader_default_world_nodetree(wo);
mat = DRW_shader_from_world(wo, ntree, options, is_volume, deferred, cbfn, nullptr);
mat = DRW_shader_from_world(
wo, ntree, GPU_MAT_EEVEE_LEGACY, options, is_volume, deferred, cbfn, nullptr);
}
return mat;
}

View File

@ -98,6 +98,10 @@
#define SHADOW_MAX_RAY 4
#define SHADOW_ROG_ID 0
/* Deferred Lighting. */
#define DEFERRED_RADIANCE_FORMAT GPU_R11F_G11F_B10F
#define DEFERRED_GBUFFER_ROG_ID 0
/* Ray-tracing. */
#define RAYTRACE_GROUP_SIZE 8
/* Keep this as a define to avoid shader variations. */

View File

@ -238,7 +238,7 @@ float Light::point_radiance_get(const ::Light *la)
void Light::debug_draw()
{
#ifndef NDEBUG
drw_debug_sphere(_position, influence_radius_max, float4(0.8f, 0.3f, 0.0f, 1.0f));
drw_debug_sphere(float3(_position), influence_radius_max, float4(0.8f, 0.3f, 0.0f, 1.0f));
#endif
}

View File

@ -445,27 +445,27 @@ void DeferredLayer::begin_sync()
}
{
gbuffer_ps_.init();
gbuffer_ps_.subpass_transition(GPU_ATTACHEMENT_WRITE,
{GPU_ATTACHEMENT_WRITE,
GPU_ATTACHEMENT_WRITE,
GPU_ATTACHEMENT_WRITE,
GPU_ATTACHEMENT_WRITE});
/* G-buffer. */
gbuffer_ps_.bind_image(GBUF_CLOSURE_SLOT, &inst_.gbuffer.closure_img_tx);
gbuffer_ps_.bind_image(GBUF_COLOR_SLOT, &inst_.gbuffer.color_img_tx);
/* RenderPasses & AOVs. */
gbuffer_ps_.bind_image(RBUFS_COLOR_SLOT, &inst_.render_buffers.rp_color_tx);
gbuffer_ps_.bind_image(RBUFS_VALUE_SLOT, &inst_.render_buffers.rp_value_tx);
/* Cryptomatte. */
gbuffer_ps_.bind_image(RBUFS_CRYPTOMATTE_SLOT, &inst_.render_buffers.cryptomatte_tx);
/* Storage Buffer. */
/* Textures. */
gbuffer_ps_.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
{
/* Common resources. */
/* G-buffer. */
gbuffer_ps_.bind_image(GBUF_CLOSURE_SLOT, &inst_.gbuffer.closure_img_tx);
gbuffer_ps_.bind_image(GBUF_COLOR_SLOT, &inst_.gbuffer.color_img_tx);
/* RenderPasses & AOVs. */
gbuffer_ps_.bind_image(RBUFS_COLOR_SLOT, &inst_.render_buffers.rp_color_tx);
gbuffer_ps_.bind_image(RBUFS_VALUE_SLOT, &inst_.render_buffers.rp_value_tx);
/* Cryptomatte. */
gbuffer_ps_.bind_image(RBUFS_CRYPTOMATTE_SLOT, &inst_.render_buffers.cryptomatte_tx);
/* Storage Buffer. */
/* Textures. */
gbuffer_ps_.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
inst_.bind_uniform_data(&gbuffer_ps_);
inst_.sampling.bind_resources(gbuffer_ps_);
inst_.hiz_buffer.bind_resources(gbuffer_ps_);
inst_.cryptomatte.bind_resources(gbuffer_ps_);
}
inst_.bind_uniform_data(&gbuffer_ps_);
inst_.sampling.bind_resources(gbuffer_ps_);
inst_.hiz_buffer.bind_resources(gbuffer_ps_);
inst_.cryptomatte.bind_resources(gbuffer_ps_);
DRWState state = DRW_STATE_WRITE_COLOR | DRW_STATE_DEPTH_EQUAL;
@ -483,27 +483,85 @@ void DeferredLayer::end_sync()
{
eClosureBits evaluated_closures = CLOSURE_DIFFUSE | CLOSURE_REFLECTION | CLOSURE_REFRACTION;
if (closure_bits_ & evaluated_closures) {
/* First add the tile classification step at the end of the GBuffer pass. */
{
/* Fill the tile mask texture with the closures present in each tile. */
PassMain::Sub &sub = gbuffer_ps_.sub("TileClassify");
sub.subpass_transition(GPU_ATTACHEMENT_WRITE, /* Needed for depth test. */
{GPU_ATTACHEMENT_IGNORE,
GPU_ATTACHEMENT_READ, /* Header. */
GPU_ATTACHEMENT_IGNORE,
GPU_ATTACHEMENT_IGNORE});
/* Use depth test to reject background pixels. */
/* WORKAROUND: Avoid rasterizer discard, but the shaders actually use no fragment output. */
sub.state_set(DRW_STATE_WRITE_STENCIL | DRW_STATE_DEPTH_GREATER);
sub.shader_set(inst_.shaders.static_shader_get(DEFERRED_TILE_CLASSIFY));
sub.bind_image("tile_mask_img", &tile_mask_tx_);
sub.push_constant("closure_tile_size_shift", &closure_tile_size_shift_);
sub.barrier(GPU_BARRIER_TEXTURE_FETCH);
sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);
}
{
PassMain::Sub &sub = gbuffer_ps_.sub("TileCompaction");
/* Use rasterizer discard. This processes the tile data to create tile command lists. */
sub.state_set(DRW_STATE_NO_DRAW);
sub.shader_set(inst_.shaders.static_shader_get(DEFERRED_TILE_COMPACT));
sub.bind_texture("tile_mask_tx", &tile_mask_tx_);
sub.bind_ssbo("closure_single_tile_buf", &closure_bufs_[0].tile_buf_);
sub.bind_ssbo("closure_single_draw_buf", &closure_bufs_[0].draw_buf_);
sub.bind_ssbo("closure_double_tile_buf", &closure_bufs_[1].tile_buf_);
sub.bind_ssbo("closure_double_draw_buf", &closure_bufs_[1].draw_buf_);
sub.bind_ssbo("closure_triple_tile_buf", &closure_bufs_[2].tile_buf_);
sub.bind_ssbo("closure_triple_draw_buf", &closure_bufs_[2].draw_buf_);
sub.barrier(GPU_BARRIER_TEXTURE_FETCH);
sub.draw_procedural(GPU_PRIM_POINTS, 1, max_lighting_tile_count_);
}
{
PassSimple &pass = eval_light_ps_;
pass.init();
/* Use depth test to reject background pixels. */
/* WORKAROUND: Avoid rasterizer discard, but the shaders actually use no fragment output. */
pass.state_set(DRW_STATE_WRITE_STENCIL | DRW_STATE_DEPTH_GREATER);
pass.shader_set(inst_.shaders.static_shader_get(DEFERRED_LIGHT));
pass.bind_image("direct_diffuse_img", &direct_diffuse_tx_);
pass.bind_image("direct_reflect_img", &direct_reflect_tx_);
pass.bind_image("direct_refract_img", &direct_refract_tx_);
pass.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
pass.bind_image(RBUFS_COLOR_SLOT, &inst_.render_buffers.rp_color_tx);
pass.bind_image(RBUFS_VALUE_SLOT, &inst_.render_buffers.rp_value_tx);
inst_.bind_uniform_data(&pass);
inst_.gbuffer.bind_resources(pass);
inst_.lights.bind_resources(pass);
inst_.shadows.bind_resources(pass);
inst_.sampling.bind_resources(pass);
inst_.hiz_buffer.bind_resources(pass);
pass.barrier(GPU_BARRIER_TEXTURE_FETCH | GPU_BARRIER_SHADER_IMAGE_ACCESS);
pass.draw_procedural(GPU_PRIM_TRIS, 1, 3);
{
PassSimple::Sub &sub = pass.sub("StencilSet");
sub.state_set(DRW_STATE_WRITE_STENCIL | DRW_STATE_STENCIL_ALWAYS |
DRW_STATE_DEPTH_GREATER);
sub.shader_set(inst_.shaders.static_shader_get(DEFERRED_TILE_STENCIL));
sub.push_constant("closure_tile_size_shift", &closure_tile_size_shift_);
sub.bind_texture("direct_radiance_tx", &direct_radiance_txs_[0]);
/* Set stencil value for each tile complexity level. */
for (int i = 0; i < ARRAY_SIZE(closure_bufs_); i++) {
sub.bind_ssbo("closure_tile_buf", &closure_bufs_[i].tile_buf_);
sub.state_stencil(0xFFu, 1u << i, 0xFFu);
sub.draw_procedural_indirect(GPU_PRIM_TRIS, closure_bufs_[i].draw_buf_);
}
}
{
PassSimple::Sub &sub = pass.sub("Eval");
/* Use depth test to reject background pixels which have not been stencil cleared. */
/* WORKAROUND: Avoid rasterizer discard by enabling stencil write, but the shaders actually
* use no fragment output. */
sub.state_set(DRW_STATE_WRITE_STENCIL | DRW_STATE_STENCIL_EQUAL | DRW_STATE_DEPTH_GREATER);
sub.barrier(GPU_BARRIER_SHADER_STORAGE);
sub.bind_texture(RBUFS_UTILITY_TEX_SLOT, inst_.pipelines.utility_tx);
sub.bind_image(RBUFS_COLOR_SLOT, &inst_.render_buffers.rp_color_tx);
sub.bind_image(RBUFS_VALUE_SLOT, &inst_.render_buffers.rp_value_tx);
/* Submit the more costly ones first to avoid long tail in occupancy.
* See page 78 of "Siggraph 2023: Unreal Engine Substrate" by Hillaire & de Rousiers. */
for (int i = ARRAY_SIZE(closure_bufs_) - 1; i >= 0; i--) {
sub.shader_set(inst_.shaders.static_shader_get(eShaderType(DEFERRED_LIGHT_SINGLE + i)));
sub.bind_image("direct_radiance_1_img", &direct_radiance_txs_[0]);
sub.bind_image("direct_radiance_2_img", &direct_radiance_txs_[1]);
sub.bind_image("direct_radiance_3_img", &direct_radiance_txs_[2]);
inst_.bind_uniform_data(&sub);
inst_.gbuffer.bind_resources(sub);
inst_.lights.bind_resources(sub);
inst_.shadows.bind_resources(sub);
inst_.sampling.bind_resources(sub);
inst_.hiz_buffer.bind_resources(sub);
sub.state_stencil(0xFFu, 1u << i, 0xFFu);
sub.draw_procedural(GPU_PRIM_TRIS, 1, 3);
}
}
}
{
PassSimple &pass = combine_ps_;
@ -511,9 +569,9 @@ void DeferredLayer::end_sync()
/* Use depth test to reject background pixels. */
pass.state_set(DRW_STATE_WRITE_COLOR | DRW_STATE_DEPTH_GREATER | DRW_STATE_BLEND_ADD_FULL);
pass.shader_set(inst_.shaders.static_shader_get(DEFERRED_COMBINE));
pass.bind_image("direct_diffuse_img", &direct_diffuse_tx_);
pass.bind_image("direct_reflect_img", &direct_reflect_tx_);
pass.bind_image("direct_refract_img", &direct_refract_tx_);
pass.bind_image("direct_radiance_1_img", &direct_radiance_txs_[0]);
pass.bind_image("direct_radiance_2_img", &direct_radiance_txs_[1]);
pass.bind_image("direct_radiance_3_img", &direct_radiance_txs_[2]);
pass.bind_image("indirect_diffuse_img", &indirect_diffuse_tx_);
pass.bind_image("indirect_reflect_img", &indirect_reflect_tx_);
pass.bind_image("indirect_refract_img", &indirect_refract_tx_);
@ -566,6 +624,7 @@ void DeferredLayer::render(View &main_view,
* environment. So in this case, disable tracing and fallback to probe. */
bool do_screen_space_refraction = !is_first_pass && (closure_bits_ & CLOSURE_REFRACTION);
bool do_screen_space_reflection = (closure_bits_ & CLOSURE_REFLECTION);
eGPUTextureUsage usage_rw = GPU_TEXTURE_USAGE_SHADER_READ | GPU_TEXTURE_USAGE_SHADER_WRITE;
if (do_screen_space_reflection) {
/* TODO(fclem): Verify if GPU_TEXTURE_USAGE_ATTACHMENT is needed for the copy and the clear. */
@ -609,14 +668,33 @@ void DeferredLayer::render(View &main_view,
}
}
if (/* FIXME(fclem): Metal doesn't clear the whole framebuffer correctly. */
GPU_backend_get_type() == GPU_BACKEND_METAL ||
/* FIXME(fclem): Vulkan doesn't implement load / store config yet. */
if (/* FIXME(fclem): Vulkan doesn't implement load / store config yet. */
GPU_backend_get_type() == GPU_BACKEND_VULKAN)
{
inst_.gbuffer.header_tx.clear(int4(0));
}
int2 tile_mask_size;
int tile_count;
closure_tile_size_shift_ = 4;
/* Increase the tile size until the tile count fits the budget. */
for (int i = 0; i < 4; i++, closure_tile_size_shift_++) {
tile_mask_size = math::divide_ceil(extent, int2(1u << closure_tile_size_shift_));
tile_count = tile_mask_size.x * tile_mask_size.y;
if (tile_count <= max_lighting_tile_count_) {
break;
}
}
int target_count = power_of_2_max_u(tile_count);
for (int i = 0; i < ARRAY_SIZE(closure_bufs_); i++) {
closure_bufs_[i].tile_buf_.resize(target_count);
closure_bufs_[i].draw_buf_.clear_to_zero();
}
tile_mask_tx_.ensure_2d_array(GPU_R8UI, tile_mask_size, 4, usage_rw);
tile_mask_tx_.clear(uint4(0));
GPU_framebuffer_bind_ex(gbuffer_fb,
{
{GPU_LOADACTION_LOAD, GPU_STOREACTION_STORE}, /* Depth */
@ -646,11 +724,10 @@ void DeferredLayer::render(View &main_view,
inst_.shadows.set_view(render_view);
{
eGPUTextureUsage usage = GPU_TEXTURE_USAGE_SHADER_READ | GPU_TEXTURE_USAGE_SHADER_WRITE;
direct_diffuse_tx_.acquire(extent, GPU_RGBA16F, usage);
direct_reflect_tx_.acquire(extent, GPU_RGBA16F, usage);
direct_refract_tx_.acquire(extent, GPU_RGBA16F, usage);
int closure_count = count_bits_i(closure_bits_ & (CLOSURE_REFLECTION | CLOSURE_DIFFUSE));
for (int i = 0; i < ARRAY_SIZE(direct_radiance_txs_); i++) {
direct_radiance_txs_[i].acquire(
(closure_count > 1) ? extent : int2(1), GPU_R11F_G11F_B10F, usage_rw);
}
GPU_framebuffer_bind(combined_fb);
@ -676,7 +753,8 @@ void DeferredLayer::render(View &main_view,
indirect_reflect_tx_ = reflect_result.get();
indirect_refract_tx_ = refract_result.get();
inst_.subsurface.render(direct_diffuse_tx_, indirect_diffuse_tx_, closure_bits_, render_view);
inst_.subsurface.render(
direct_radiance_txs_[0], indirect_diffuse_tx_, closure_bits_, render_view);
GPU_framebuffer_bind(combined_fb);
inst_.manager->submit(combine_ps_);
@ -685,9 +763,9 @@ void DeferredLayer::render(View &main_view,
refract_result.release();
reflect_result.release();
direct_diffuse_tx_.release();
direct_reflect_tx_.release();
direct_refract_tx_.release();
for (int i = 0; i < ARRAY_SIZE(direct_radiance_txs_); i++) {
direct_radiance_txs_[i].release();
}
if (do_screen_space_reflection) {
GPU_texture_copy(radiance_feedback_tx_, rb.combined_tx);
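
The tile-budget loop above starts closure_tile_size_shift_ at 4 and doubles the tile size until the tile count fits max_lighting_tile_count_ (declared as 128 * 128 in the header hunk below). A self-contained sketch of that search; divide_ceil here is a local stand-in assumed to behave like math::divide_ceil:

// Sketch only: pick the smallest power-of-two tile size whose tile count fits the budget.
#include <cstdio>

struct int2 { int x, y; };

static int2 divide_ceil(int2 a, int b)  /* Stand-in for math::divide_ceil (assumed semantics). */
{
  return {(a.x + b - 1) / b, (a.y + b - 1) / b};
}

int main()
{
  const int2 extent = {1920, 1080};      /* Hypothetical render extent. */
  const int max_tile_count = 128 * 128;  /* Budget, as in max_lighting_tile_count_. */
  int shift = 4;                         /* Start with 16x16 pixel tiles. */
  int2 tile_mask_size = {0, 0};
  int tile_count = 0;
  for (int i = 0; i < 4; i++, shift++) {
    tile_mask_size = divide_ceil(extent, 1 << shift);
    tile_count = tile_mask_size.x * tile_mask_size.y;
    if (tile_count <= max_tile_count) {
      break;  /* Tile count fits the budget; keep this shift. */
    }
  }
  std::printf("tile size %d px -> %d x %d tiles (%d)\n",
              1 << shift, tile_mask_size.x, tile_mask_size.y, tile_count);
  return 0;
}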

View File

@ -204,6 +204,8 @@ class DeferredLayer : DeferredLayerBase {
private:
Instance &inst_;
static constexpr int max_lighting_tile_count_ = 128 * 128;
/* Evaluate all light objects contribution. */
PassSimple eval_light_ps_ = {"EvalLights"};
/* Combine direct and indirect light contributions and apply BSDF color. */
@ -216,15 +218,28 @@ class DeferredLayer : DeferredLayerBase {
* BSDF color and do additive blending for each of the lighting step.
*
* NOTE: Not to be confused with the render passes.
* NOTE: Using an array of textures instead of a texture array, to allow the use of TextureFromPool.
*/
TextureFromPool direct_diffuse_tx_ = {"direct_diffuse_tx"};
TextureFromPool direct_reflect_tx_ = {"direct_reflect_tx"};
TextureFromPool direct_refract_tx_ = {"direct_refract_tx"};
TextureFromPool direct_radiance_txs_[3] = {
{"direct_radiance_1"}, {"direct_radiance_2"}, {"direct_radiance_3"}};
/* Reference to ray-tracing result. */
GPUTexture *indirect_diffuse_tx_ = nullptr;
GPUTexture *indirect_reflect_tx_ = nullptr;
GPUTexture *indirect_refract_tx_ = nullptr;
/* Parameters for the light evaluation pass. */
int closure_tile_size_shift_ = 0;
/* Tile buffers for different lighting complexity levels. */
struct {
DrawIndirectBuf draw_buf_ = {"DrawIndirectBuf"};
ClosureTileBuf tile_buf_ = {"ClosureTileBuf"};
} closure_bufs_[3];
/**
* Tile texture containing several bools per tile, indicating the presence of each feature.
* It is used to select a specialized shader for each tile.
*/
Texture tile_mask_tx_ = {"tile_mask_tx_"};
/* TODO(fclem): This should be a TextureFromPool. */
Texture radiance_behind_tx_ = {"radiance_behind_tx"};
/* TODO(fclem): This shouldn't be part of the pipeline but of the view. */

View File

@ -94,12 +94,22 @@ const char *ShaderModule::static_shader_create_info_name_get(eShaderType shader_
return "eevee_film_cryptomatte_post";
case DEFERRED_COMBINE:
return "eevee_deferred_combine";
case DEFERRED_LIGHT:
return "eevee_deferred_light";
case DEFERRED_LIGHT_SINGLE:
return "eevee_deferred_light_single";
case DEFERRED_LIGHT_DOUBLE:
return "eevee_deferred_light_double";
case DEFERRED_LIGHT_TRIPLE:
return "eevee_deferred_light_triple";
case DEFERRED_CAPTURE_EVAL:
return "eevee_deferred_capture_eval";
case DEFERRED_PLANAR_EVAL:
return "eevee_deferred_planar_eval";
case DEFERRED_TILE_CLASSIFY:
return "eevee_deferred_tile_classify";
case DEFERRED_TILE_COMPACT:
return "eevee_deferred_tile_compact";
case DEFERRED_TILE_STENCIL:
return "eevee_deferred_tile_stencil";
case HIZ_DEBUG:
return "eevee_hiz_debug";
case HIZ_UPDATE:
@ -668,8 +678,14 @@ GPUMaterial *ShaderModule::material_shader_get(::Material *blender_mat,
uint64_t shader_uuid = shader_uuid_from_material_type(
pipeline_type, geometry_type, displacement_type, blender_mat->blend_flag);
return DRW_shader_from_material(
blender_mat, nodetree, shader_uuid, is_volume, deferred_compilation, codegen_callback, this);
return DRW_shader_from_material(blender_mat,
nodetree,
GPU_MAT_EEVEE,
shader_uuid,
is_volume,
deferred_compilation,
codegen_callback,
this);
}
GPUMaterial *ShaderModule::world_shader_get(::World *blender_world,
@ -683,8 +699,14 @@ GPUMaterial *ShaderModule::world_shader_get(::World *blender_world,
uint64_t shader_uuid = shader_uuid_from_material_type(pipeline_type, geometry_type);
return DRW_shader_from_world(
blender_world, nodetree, shader_uuid, is_volume, defer_compilation, codegen_callback, this);
return DRW_shader_from_world(blender_world,
nodetree,
GPU_MAT_EEVEE,
shader_uuid,
is_volume,
defer_compilation,
codegen_callback,
this);
}
/* Variation to compile a material only with a nodetree. Caller needs to maintain the list of
@ -704,6 +726,7 @@ GPUMaterial *ShaderModule::material_shader_get(const char *name,
nodetree,
&materials,
name,
GPU_MAT_EEVEE,
shader_uuid,
is_volume,
false,

View File

@ -32,10 +32,15 @@ enum eShaderType {
FILM_COMP,
FILM_CRYPTOMATTE_POST,
DEFERRED_COMBINE,
DEFERRED_LIGHT,
DEFERRED_CAPTURE_EVAL,
DEFERRED_COMBINE,
DEFERRED_LIGHT_SINGLE,
DEFERRED_LIGHT_DOUBLE,
DEFERRED_LIGHT_TRIPLE,
DEFERRED_PLANAR_EVAL,
DEFERRED_TILE_CLASSIFY,
DEFERRED_TILE_COMPACT,
DEFERRED_TILE_STENCIL,
DEBUG_GBUFFER,
DEBUG_SURFELS,

View File

@ -749,10 +749,10 @@ struct LightData {
#define _clipmap_origin_y object_mat[3][3]
/** Aliases for axes. */
#ifndef USE_GPU_SHADER_CREATE_INFO
# define _right object_mat[0].xyz()
# define _up object_mat[1].xyz()
# define _back object_mat[2].xyz()
# define _position object_mat[3].xyz()
# define _right object_mat[0]
# define _up object_mat[1]
# define _back object_mat[2]
# define _position object_mat[3]
#else
# define _right object_mat[0].xyz
# define _up object_mat[1].xyz
@ -1426,7 +1426,7 @@ struct PipelineInfoData {
float alpha_hash_scale;
float _pad0;
float _pad1;
float _pad3;
float _pad2;
};
BLI_STATIC_ASSERT_ALIGN(PipelineInfoData, 16)
@ -1528,6 +1528,7 @@ float4 utility_tx_sample_lut(sampler2DArray util_tx, float cos_theta, float roug
using AOVsInfoDataBuf = draw::StorageBuffer<AOVsInfoData>;
using CameraDataBuf = draw::UniformBuffer<CameraData>;
using ClosureTileBuf = draw::StorageArrayBuffer<uint, 1024, true>;
using DepthOfFieldDataBuf = draw::UniformBuffer<DepthOfFieldData>;
using DepthOfFieldScatterListBuf = draw::StorageArrayBuffer<ScatterRect, 16, true>;
using DrawIndirectBuf = draw::StorageBuffer<DrawCommand, true>;
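
In the PipelineInfoData hunk above, renaming _pad3 to _pad2 only fixes the numbering; the padding itself is there so the struct stays a 16-byte multiple for GPU uniform/storage buffer layout, which BLI_STATIC_ASSERT_ALIGN verifies. A reduced sketch of the same invariant (names are illustrative, not the real struct):

// Sketch only: explicit padding keeps a GPU-visible struct at a 16-byte multiple.
struct PipelineInfoDataSketch {
  float alpha_hash_scale;
  float _pad0;
  float _pad1;
  float _pad2;
};
static_assert(sizeof(PipelineInfoDataSketch) % 16 == 0,
              "GPU-visible structs must be padded to 16-byte multiples");

int main()
{
  return 0;
}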

View File

@ -22,18 +22,19 @@ void main()
vec3 refract_light = vec3(0.0);
if (gbuf.has_diffuse) {
diffuse_light = imageLoad(direct_diffuse_img, texel).rgb +
diffuse_light = imageLoad(direct_radiance_1_img, texel).rgb +
imageLoad(indirect_diffuse_img, texel).rgb;
}
if (gbuf.has_reflection) {
reflect_light = imageLoad(direct_reflect_img, texel).rgb +
reflect_light = imageLoad(direct_radiance_2_img, texel).rgb +
imageLoad(indirect_reflect_img, texel).rgb;
}
if (gbuf.has_refraction) {
refract_light = /* imageLoad(direct_refract_img, texel).rgb + */ /* TODO: Not implemented. */
imageLoad(indirect_refract_img, texel).rgb;
refract_light =
/* imageLoad(direct_radiance_3_img, texel).rgb + */ /* TODO: Not implemented. */
imageLoad(indirect_refract_img, texel).rgb;
}
/* Light passes. */

View File

@ -20,32 +20,45 @@ void main()
float depth = texelFetch(hiz_tx, texel, 0).r;
GBufferData gbuf = gbuffer_read(gbuf_header_tx, gbuf_closure_tx, gbuf_color_tx, texel);
if (!gbuf.has_reflection && !gbuf.has_diffuse /* TODO(fclem) && !gbuf.has_refraction */) {
if (gbuf.closure_count == 0) {
return;
}
vec3 P = drw_point_screen_to_world(vec3(uvcoordsvar.xy, depth));
/* Assume reflection closure normal is always somewhat representative of the geometric normal.
* Ng is only used for shadow biases and subsurface check in this case. */
vec3 Ng = gbuf.has_reflection ? gbuf.reflection.N : gbuf.diffuse.N;
vec3 Ng = gbuf.surface_N;
vec3 V = drw_world_incident_vector(P);
float vPz = dot(drw_view_forward(), P) - dot(drw_view_forward(), drw_view_position());
ClosureLightStack stack;
ClosureLight cl_diff;
cl_diff.N = gbuf.diffuse.N;
cl_diff.ltc_mat = LTC_LAMBERT_MAT;
cl_diff.type = LIGHT_DIFFUSE;
stack.cl[0] = cl_diff;
/* TODO(fclem): This is waiting for fully flexible evaluation pipeline. We need to refactor the
* raytracing pipeline first. */
if (gbuf.has_diffuse) {
ClosureLight cl_diff;
cl_diff.N = gbuf.diffuse.N;
cl_diff.ltc_mat = LTC_LAMBERT_MAT;
cl_diff.type = LIGHT_DIFFUSE;
stack.cl[0] = cl_diff;
}
else {
ClosureLight cl_refl;
cl_refl.N = gbuf.reflection.N;
cl_refl.ltc_mat = LTC_GGX_MAT(dot(gbuf.reflection.N, V), gbuf.reflection.roughness);
cl_refl.type = LIGHT_SPECULAR;
stack.cl[0] = cl_refl;
}
#if LIGHT_CLOSURE_EVAL_COUNT > 1
ClosureLight cl_refl;
cl_refl.N = gbuf.reflection.N;
cl_refl.ltc_mat = LTC_GGX_MAT(dot(gbuf.reflection.N, V), gbuf.reflection.roughness);
cl_refl.type = LIGHT_SPECULAR;
stack.cl[1] = cl_refl;
#endif
#ifdef SSS_TRANSMITTANCE
#if LIGHT_CLOSURE_EVAL_COUNT > 2
ClosureLight cl_sss;
cl_sss.N = -gbuf.diffuse.N;
cl_sss.ltc_mat = LTC_LAMBERT_MAT;
@ -53,54 +66,65 @@ void main()
stack.cl[2] = cl_sss;
#endif
#ifdef SSS_TRANSMITTANCE
float shadow_thickness = thickness_from_shadow(P, Ng, vPz);
float thickness = (shadow_thickness != THICKNESS_NO_VALUE) ?
max(shadow_thickness, gbuf.thickness) :
gbuf.thickness;
#else
float thickness = 0.0;
#ifdef SSS_TRANSMITTANCE
if (gbuf.has_sss) {
float shadow_thickness = thickness_from_shadow(P, Ng, vPz);
thickness = (shadow_thickness != THICKNESS_NO_VALUE) ? max(shadow_thickness, gbuf.thickness) :
gbuf.thickness;
}
#endif
light_eval(stack, P, Ng, V, vPz, thickness);
vec3 radiance_shadowed = stack.cl[0].light_shadowed;
vec3 radiance_unshadowed = stack.cl[0].light_unshadowed;
#if LIGHT_CLOSURE_EVAL_COUNT > 1
radiance_shadowed += stack.cl[1].light_shadowed;
radiance_unshadowed += stack.cl[1].light_unshadowed;
#endif
#if LIGHT_CLOSURE_EVAL_COUNT > 2
radiance_shadowed += stack.cl[2].light_shadowed;
radiance_unshadowed += stack.cl[2].light_unshadowed;
#endif
#ifdef SSS_TRANSMITTANCE
if (gbuf.diffuse.sss_id != 0u) {
if (gbuf.has_sss) {
vec3 sss_profile = subsurface_transmission(gbuf.diffuse.sss_radius, thickness);
stack.cl[2].light_shadowed *= sss_profile;
stack.cl[2].light_unshadowed *= sss_profile;
/* Add to diffuse light for processing inside the Screen Space SSS pass. */
stack.cl[0].light_shadowed += stack.cl[2].light_shadowed;
stack.cl[0].light_unshadowed += stack.cl[2].light_unshadowed;
}
else {
stack.cl[2].light_shadowed = vec3(0.0);
stack.cl[2].light_unshadowed = vec3(0.0);
}
#endif
vec3 radiance_diffuse = stack.cl[0].light_shadowed;
vec3 radiance_specular = stack.cl[1].light_shadowed;
#ifdef SSS_TRANSMITTANCE
radiance_diffuse += stack.cl[2].light_shadowed;
#endif
vec3 radiance_shadowed = stack.cl[0].light_shadowed;
vec3 radiance_unshadowed = stack.cl[0].light_unshadowed;
radiance_shadowed += stack.cl[1].light_shadowed;
radiance_unshadowed += stack.cl[1].light_unshadowed;
#ifdef SSS_TRANSMITTANCE
radiance_shadowed += stack.cl[2].light_shadowed;
radiance_unshadowed += stack.cl[2].light_unshadowed;
#endif
/* TODO(fclem): Change shadow pass to be colored. */
vec3 shadows = radiance_shadowed * safe_rcp(radiance_unshadowed);
output_renderpass_value(uniform_buf.render_pass.shadow_id, average(shadows));
if (gbuf.has_diffuse) {
imageStore(direct_diffuse_img, texel, vec4(radiance_diffuse, 1.0));
if (gbuf.closure_count > 0) {
/* TODO(fclem): This is waiting for fully flexible evaluation pipeline. We need to refactor the
* raytracing pipeline first. */
if (gbuf.has_diffuse) {
imageStore(direct_radiance_1_img, texel, vec4(stack.cl[0].light_shadowed, 1.0));
}
else {
imageStore(direct_radiance_2_img, texel, vec4(stack.cl[0].light_shadowed, 1.0));
}
}
if (gbuf.has_reflection) {
imageStore(direct_reflect_img, texel, vec4(radiance_specular, 1.0));
#if LIGHT_CLOSURE_EVAL_COUNT > 1
if (gbuf.closure_count > 1) {
imageStore(direct_radiance_2_img, texel, vec4(stack.cl[1].light_shadowed, 1.0));
}
/* TODO(fclem): Support LTC for refraction. */
// imageStore(direct_refract_img, texel, vec4(cl_refr.light_shadowed, 1.0));
#endif
#if LIGHT_CLOSURE_EVAL_COUNT > 2
# if 0 /* Will work when we have fully flexible evaluation. */
if (gbuf.closure_count > 2) {
imageStore(direct_radiance_3_img, texel, vec4(stack.cl[2].light_shadowed, 1.0));
}
# endif
#endif
}

View File

@ -0,0 +1,33 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/**
* This pass loads Gbuffer data and outputs a mask of tiles to process.
* This mask is then processed by the compaction phase.
*/
#pragma BLENDER_REQUIRE(gpu_shader_utildefines_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_math_vector_lib.glsl)
#pragma BLENDER_REQUIRE(gpu_shader_codegen_lib.glsl)
#pragma BLENDER_REQUIRE(eevee_gbuffer_lib.glsl)
void main()
{
ivec2 texel = ivec2(gl_FragCoord.xy);
ivec2 tile_co = texel >> closure_tile_size_shift;
if (gbuffer_has_closure(in_gbuffer_header, eClosureBits(CLOSURE_DIFFUSE))) {
imageStore(tile_mask_img, ivec3(tile_co, 0), uvec4(1u));
}
if (gbuffer_has_closure(in_gbuffer_header, eClosureBits(CLOSURE_REFLECTION))) {
imageStore(tile_mask_img, ivec3(tile_co, 1), uvec4(1u));
}
if (gbuffer_has_closure(in_gbuffer_header, eClosureBits(CLOSURE_REFRACTION))) {
imageStore(tile_mask_img, ivec3(tile_co, 2), uvec4(1u));
}
if (gbuffer_has_closure(in_gbuffer_header, eClosureBits(CLOSURE_SSS))) {
imageStore(tile_mask_img, ivec3(tile_co, 3), uvec4(1u));
}
}

View File

@ -0,0 +1,51 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/**
* Convert the tile classification texture into streams of tiles of each type.
* Dispatched with 1 vertex (thread) per tile.
*/
#pragma BLENDER_REQUIRE(gpu_shader_utildefines_lib.glsl)
void main()
{
/* Doesn't matter. Doesn't get rasterized. */
gl_Position = vec4(0.0);
int tile_per_row = textureSize(tile_mask_tx, 0).x;
ivec2 tile_coord = ivec2(gl_VertexID % tile_per_row, gl_VertexID / tile_per_row);
if (gl_VertexID == 0) {
closure_double_draw_buf.instance_len = 1u;
closure_single_draw_buf.instance_len = 1u;
closure_triple_draw_buf.instance_len = 1u;
}
if (!in_texture_range(tile_coord, tile_mask_tx)) {
return;
}
uint closure_count = texelFetch(tile_mask_tx, ivec3(tile_coord, 0), 0).r +
texelFetch(tile_mask_tx, ivec3(tile_coord, 1), 0).r +
// texelFetch(tile_mask_tx, ivec3(tile_coord, 2), 0).r + /* TODO: refract */
texelFetch(tile_mask_tx, ivec3(tile_coord, 3), 0).r;
/* TODO(fclem): This is waiting for fully flexible evaluation pipeline. We need to refactor the
* raytracing pipeline first. */
bool has_reflection = texelFetch(tile_mask_tx, ivec3(tile_coord, 1), 0).r != 0u;
bool has_sss = texelFetch(tile_mask_tx, ivec3(tile_coord, 3), 0).r != 0u;
if (closure_count == 3 || has_sss) {
uint tile_index = atomicAdd(closure_triple_draw_buf.vertex_len, 6u) / 6u;
closure_triple_tile_buf[tile_index] = packUvec2x16(uvec2(tile_coord));
}
else if (closure_count == 2 || has_reflection) {
uint tile_index = atomicAdd(closure_double_draw_buf.vertex_len, 6u) / 6u;
closure_double_tile_buf[tile_index] = packUvec2x16(uvec2(tile_coord));
}
else if (closure_count == 1) {
uint tile_index = atomicAdd(closure_single_draw_buf.vertex_len, 6u) / 6u;
closure_single_tile_buf[tile_index] = packUvec2x16(uvec2(tile_coord));
}
}
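
Each surviving tile above reserves 6 vertices on the matching indirect draw command and stores its coordinate, packed into a single uint, at slot vertex_len / 6. A CPU-side sketch of that bookkeeping; the pack layout mirrors what packUvec2x16 is assumed to do (low 16 bits = x, high 16 bits = y):

// Sketch only: packed tile coordinates and the vertex_len / 6 slot index.
#include <cstdint>
#include <cstdio>

static uint32_t pack_uvec2_x16(uint32_t x, uint32_t y)  /* Assumed packUvec2x16 layout. */
{
  return (x & 0xFFFFu) | ((y & 0xFFFFu) << 16u);
}

int main()
{
  uint32_t vertex_len = 0;  /* Stands in for the DrawCommand's atomic vertex counter. */
  uint32_t tile_buf[4] = {};
  const uint32_t tiles[][2] = {{3u, 7u}, {12u, 1u}};
  for (const auto &tile : tiles) {
    const uint32_t old_len = vertex_len;  /* atomicAdd returns the previous value. */
    vertex_len += 6u;                     /* One quad = two triangles = 6 vertices. */
    tile_buf[old_len / 6u] = pack_uvec2_x16(tile[0], tile[1]);
  }
  const uint32_t packed = tile_buf[1];
  std::printf("tile 1 = (%u, %u), vertex_len = %u\n", packed & 0xFFFFu, packed >> 16u, vertex_len);
  return 0;
}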

View File

@ -0,0 +1,12 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/**
* Load tile classification data and mark stencil areas.
*/
void main()
{
/* Stencil only pass. Passthrough. */
}

View File

@ -0,0 +1,29 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/**
* Load tile classification data and mark stencil areas.
*/
#pragma BLENDER_REQUIRE(gpu_shader_utildefines_lib.glsl)
void main()
{
int tile_id = gl_VertexID / 6;
int vertex_id = gl_VertexID % 6;
ivec2 tile_coord = ivec2(unpackUvec2x16(closure_tile_buf[tile_id]));
/* Generate a quad from 2 triangles with the same winding.
* This way it can be merged on some hardware. */
int v = (vertex_id > 2) ? (3 - (vertex_id - 3)) : vertex_id;
ivec2 tile_corner = ivec2(v & 1, v >> 1);
int tile_size = (1 << closure_tile_size_shift);
vec2 ss_coord = vec2((tile_coord + tile_corner) * tile_size) /
vec2(textureSize(direct_radiance_tx, 0));
vec2 ndc_coord = ss_coord * 2.0 - 1.0;
/* gl_Position expects homogeneous-space coordinates, but this is the same thing as NDC in 2D mode. */
gl_Position = vec4(ndc_coord, 1.0, 1.0);
}
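
The vertex indexing above expands 6 vertex IDs into a quad built from two triangles that share the same winding. A plain C++ sketch enumerating the mapping with the same arithmetic:

// Sketch only: the 6-vertex -> quad-corner mapping used by the stencil pass.
#include <cstdio>

int main()
{
  for (int vertex_id = 0; vertex_id < 6; vertex_id++) {
    /* Fold vertex IDs 3..5 back onto 3, 2, 1 so both triangles keep the same winding. */
    const int v = (vertex_id > 2) ? (3 - (vertex_id - 3)) : vertex_id;
    const int corner_x = v & 1;
    const int corner_y = v >> 1;
    std::printf("vertex %d -> v=%d corner=(%d, %d)\n", vertex_id, v, corner_x, corner_y);
  }
  return 0;
}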

View File

@ -163,6 +163,13 @@ bool gbuffer_has_closure(uint header, eClosureBits closure)
return has_diffuse;
}
bool has_sss = (gbuffer_header_unpack(header, layer) == GBUF_SSS);
layer += int(has_sss);
if (closure == eClosureBits(CLOSURE_SSS)) {
return has_sss;
}
return false;
}
@ -265,8 +272,10 @@ struct GBufferData {
bool has_diffuse;
bool has_reflection;
bool has_refraction;
bool has_sss;
bool has_any_surface;
uint header;
uint closure_count;
};
GBufferData gbuffer_read(usampler2D header_tx,
@ -287,6 +296,7 @@ GBufferData gbuffer_read(usampler2D header_tx,
}
gbuf.thickness = 0.0;
gbuf.closure_count = 0u;
/* First closure is always written. */
gbuf.surface_N = gbuffer_normal_unpack(texelFetch(closure_tx, ivec3(texel, 0), 0).xy);
@ -318,6 +328,8 @@ GBufferData gbuffer_read(usampler2D header_tx,
gbuf.diffuse.sss_radius = vec3(0.0, 0.0, 0.0);
gbuf.diffuse.sss_id = 0u;
gbuf.closure_count = 2u;
return gbuf;
}
@ -333,6 +345,7 @@ GBufferData gbuffer_read(usampler2D header_tx,
gbuf.refraction.N = gbuffer_normal_unpack(closure_packed.xy);
gbuf.refraction.roughness = closure_packed.z;
gbuf.refraction.ior = gbuffer_ior_unpack(closure_packed.w);
gbuf.closure_count += 1u;
layer += 1;
}
else {
@ -352,6 +365,7 @@ GBufferData gbuffer_read(usampler2D header_tx,
gbuf.reflection.color = gbuffer_color_unpack(color_packed);
gbuf.reflection.N = gbuffer_normal_unpack(closure_packed.xy);
gbuf.reflection.roughness = closure_packed.z;
gbuf.closure_count += 1u;
layer += 1;
}
else {
@ -370,6 +384,7 @@ GBufferData gbuffer_read(usampler2D header_tx,
gbuf.diffuse.color = gbuffer_color_unpack(color_packed);
gbuf.diffuse.N = gbuffer_normal_unpack(closure_packed.xy);
gbuf.thickness = gbuffer_thickness_unpack(closure_packed.w);
gbuf.closure_count += 1u;
layer += 1;
}
else {
@ -379,9 +394,9 @@ GBufferData gbuffer_read(usampler2D header_tx,
gbuf.thickness = 0.0;
}
bool has_sss = (gbuffer_header_unpack(gbuf.header, layer) == GBUF_SSS);
gbuf.has_sss = (gbuffer_header_unpack(gbuf.header, layer) == GBUF_SSS);
if (has_sss) {
if (gbuf.has_sss) {
vec4 closure_packed = texelFetch(closure_tx, ivec3(texel, layer), 0);
gbuf.diffuse.sss_radius = gbuffer_sss_radii_unpack(closure_packed.xyz);

View File

@ -17,17 +17,51 @@ GPU_SHADER_CREATE_INFO(eevee_gbuffer_data)
.sampler(9, ImageType::FLOAT_2D_ARRAY, "gbuf_closure_tx")
.sampler(10, ImageType::FLOAT_2D_ARRAY, "gbuf_color_tx");
GPU_SHADER_CREATE_INFO(eevee_deferred_tile_classify)
.fragment_source("eevee_deferred_tile_classify_frag.glsl")
/* Early fragment test is needed to avoid processing background fragments. */
.early_fragment_test(true)
.additional_info("eevee_shared", "draw_fullscreen")
.subpass_in(1, Type::UINT, "in_gbuffer_header", DEFERRED_GBUFFER_ROG_ID)
.typedef_source("draw_shader_shared.h")
.image(0, GPU_R8UI, Qualifier::WRITE, ImageType::UINT_2D_ARRAY, "tile_mask_img")
.push_constant(Type::INT, "closure_tile_size_shift")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(eevee_deferred_tile_compact)
.additional_info("eevee_shared")
.typedef_source("draw_shader_shared.h")
.vertex_source("eevee_deferred_tile_compact_vert.glsl")
/* Reuse dummy stencil frag. */
.fragment_source("eevee_deferred_tile_stencil_frag.glsl")
.storage_buf(0, Qualifier::READ_WRITE, "DrawCommand", "closure_single_draw_buf")
.storage_buf(1, Qualifier::READ_WRITE, "DrawCommand", "closure_double_draw_buf")
.storage_buf(2, Qualifier::READ_WRITE, "DrawCommand", "closure_triple_draw_buf")
.storage_buf(3, Qualifier::WRITE, "uint", "closure_single_tile_buf[]")
.storage_buf(4, Qualifier::WRITE, "uint", "closure_double_tile_buf[]")
.storage_buf(5, Qualifier::WRITE, "uint", "closure_triple_tile_buf[]")
.sampler(0, ImageType::UINT_2D_ARRAY, "tile_mask_tx")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(eevee_deferred_tile_stencil)
.vertex_source("eevee_deferred_tile_stencil_vert.glsl")
.fragment_source("eevee_deferred_tile_stencil_frag.glsl")
.additional_info("eevee_shared")
/* Only for texture size. */
.sampler(0, ImageType::FLOAT_2D, "direct_radiance_tx")
.storage_buf(4, Qualifier::READ, "uint", "closure_tile_buf[]")
.push_constant(Type::INT, "closure_tile_size_shift")
.typedef_source("draw_shader_shared.h")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(eevee_deferred_light)
.fragment_source("eevee_deferred_light_frag.glsl")
/* Early fragment test is needed to avoid processing fragments without correct GBuffer data. */
/* Early fragment test is needed to avoid processing background fragments. */
.early_fragment_test(true)
/* Chaining to next pass. */
/* TODO(@fclem): These could use the sub-pass feature. */
.image_out(2, GPU_RGBA16F, "direct_diffuse_img")
.image_out(3, GPU_RGBA16F, "direct_reflect_img")
.image_out(4, GPU_RGBA16F, "direct_refract_img")
.define("SSS_TRANSMITTANCE")
.define("LIGHT_CLOSURE_EVAL_COUNT", "3")
.image_out(2, DEFERRED_RADIANCE_FORMAT, "direct_radiance_1_img")
.image_out(3, DEFERRED_RADIANCE_FORMAT, "direct_radiance_2_img")
.image_out(4, DEFERRED_RADIANCE_FORMAT, "direct_radiance_3_img")
.additional_info("eevee_shared",
"eevee_gbuffer_data",
"eevee_utility_texture",
@ -36,17 +70,32 @@ GPU_SHADER_CREATE_INFO(eevee_deferred_light)
"eevee_shadow_data",
"eevee_hiz_data",
"eevee_render_pass_out",
"draw_view",
"draw_fullscreen")
"draw_fullscreen",
"draw_view");
GPU_SHADER_CREATE_INFO(eevee_deferred_light_single)
.additional_info("eevee_deferred_light")
.define("LIGHT_CLOSURE_EVAL_COUNT", "1")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(eevee_deferred_light_double)
.additional_info("eevee_deferred_light")
.define("LIGHT_CLOSURE_EVAL_COUNT", "2")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(eevee_deferred_light_triple)
.additional_info("eevee_deferred_light")
.define("SSS_TRANSMITTANCE")
.define("LIGHT_CLOSURE_EVAL_COUNT", "3")
.do_static_compilation(true);
GPU_SHADER_CREATE_INFO(eevee_deferred_combine)
/* Early fragment test is needed to avoid processing fragments without correct GBuffer data. */
/* Early fragment test is needed to avoid processing background fragments. */
.early_fragment_test(true)
/* Inputs. */
.image_in(2, GPU_RGBA16F, "direct_diffuse_img")
.image_in(3, GPU_RGBA16F, "direct_reflect_img")
.image_in(4, GPU_RGBA16F, "direct_refract_img")
.image_in(2, DEFERRED_RADIANCE_FORMAT, "direct_radiance_1_img")
.image_in(3, DEFERRED_RADIANCE_FORMAT, "direct_radiance_2_img")
.image_in(4, DEFERRED_RADIANCE_FORMAT, "direct_radiance_3_img")
.image_in(5, RAYTRACE_RADIANCE_FORMAT, "indirect_diffuse_img")
.image_in(6, RAYTRACE_RADIANCE_FORMAT, "indirect_reflect_img")
.image_in(7, RAYTRACE_RADIANCE_FORMAT, "indirect_refract_img")
@ -98,6 +147,7 @@ GPU_SHADER_CREATE_INFO(eevee_deferred_planar_eval)
.fragment_source("eevee_deferred_planar_frag.glsl")
.do_static_compilation(true);
#undef image_array_out
#undef image_out
#undef image_in

View File

@ -153,7 +153,7 @@ GPU_SHADER_CREATE_INFO(eevee_surf_deferred)
.early_fragment_test(true)
/* Direct output. (Emissive, Holdout) */
.fragment_out(0, Type::VEC4, "out_radiance")
.fragment_out(1, Type::UINT, "out_gbuf_header")
.fragment_out(1, Type::UINT, "out_gbuf_header", DualBlend::NONE, DEFERRED_GBUFFER_ROG_ID)
.fragment_out(2, Type::VEC4, "out_gbuf_color")
.fragment_out(3, Type::VEC4, "out_gbuf_closure")
/* Everything is stored inside a two layered target, one for each format. This is to fit the

View File

@ -11,7 +11,7 @@ GPU_SHADER_CREATE_INFO(eevee_subsurface_setup)
.typedef_source("draw_shader_shared.h")
.additional_info("draw_view", "eevee_shared", "eevee_gbuffer_data")
.sampler(2, ImageType::DEPTH_2D, "depth_tx")
.image(0, GPU_RGBA16F, Qualifier::READ, ImageType::FLOAT_2D, "direct_light_img")
.image(0, DEFERRED_RADIANCE_FORMAT, Qualifier::READ, ImageType::FLOAT_2D, "direct_light_img")
.image(1, RAYTRACE_RADIANCE_FORMAT, Qualifier::READ, ImageType::FLOAT_2D, "indirect_light_img")
.image(2, SUBSURFACE_OBJECT_ID_FORMAT, Qualifier::WRITE, ImageType::UINT_2D, "object_id_img")
.image(3, SUBSURFACE_RADIANCE_FORMAT, Qualifier::WRITE, ImageType::FLOAT_2D, "radiance_img")

View File

@ -296,6 +296,7 @@ struct GPUShader *DRW_shader_create_fullscreen_with_shaderlib_ex(const char *fra
struct GPUMaterial *DRW_shader_from_world(struct World *wo,
struct bNodeTree *ntree,
eGPUMaterialEngine engine,
const uint64_t shader_id,
const bool is_volume_shader,
bool deferred,
@ -303,6 +304,7 @@ struct GPUMaterial *DRW_shader_from_world(struct World *wo,
void *thunk);
struct GPUMaterial *DRW_shader_from_material(struct Material *ma,
struct bNodeTree *ntree,
eGPUMaterialEngine engine,
const uint64_t shader_id,
const bool is_volume_shader,
bool deferred,

View File

@ -493,6 +493,7 @@ GPUShader *DRW_shader_create_fullscreen_with_shaderlib_ex(const char *frag,
GPUMaterial *DRW_shader_from_world(World *wo,
bNodeTree *ntree,
eGPUMaterialEngine engine,
const uint64_t shader_id,
const bool is_volume_shader,
bool deferred,
@ -505,6 +506,7 @@ GPUMaterial *DRW_shader_from_world(World *wo,
ntree,
&wo->gpumaterial,
wo->id.name,
engine,
shader_id,
is_volume_shader,
false,
@ -525,6 +527,7 @@ GPUMaterial *DRW_shader_from_world(World *wo,
GPUMaterial *DRW_shader_from_material(Material *ma,
bNodeTree *ntree,
eGPUMaterialEngine engine,
const uint64_t shader_id,
const bool is_volume_shader,
bool deferred,
@ -537,6 +540,7 @@ GPUMaterial *DRW_shader_from_material(Material *ma,
ntree,
&ma->gpumaterial,
ma->id.name,
engine,
shader_id,
is_volume_shader,
false,

View File

@ -117,11 +117,15 @@ Array<float2> polyline_fit_curve(Span<float2> points,
return {};
}
if (r_cubic_array == nullptr) {
return {};
}
Span<float2> r_cubic_array_span(reinterpret_cast<float2 *>(r_cubic_array),
r_cubic_array_len * 3);
Array<float2> curve_positions(r_cubic_array_span);
/* Free the c-style array. */
MEM_freeN(r_cubic_array);
free(r_cubic_array);
return curve_positions;
}
@ -153,11 +157,16 @@ IndexMask polyline_detect_corners(Span<float2> points,
/* Error occurred, return. */
return IndexMask();
}
if (r_corners == nullptr) {
return IndexMask();
}
BLI_assert(samples_max < std::numeric_limits<int>::max());
Span<int> indices(reinterpret_cast<int *>(r_corners), r_corner_len);
const IndexMask corner_mask = IndexMask::from_indices<int>(indices, memory);
/* Free the c-style array. */
MEM_freeN(r_corners);
free(r_corners);
return corner_mask;
}

View File

@ -720,7 +720,7 @@ static void invert_visibility_bmesh(Object &object, const Span<PBVHNode *> nodes
bool fully_hidden = true;
for (BMVert *vert : BKE_pbvh_bmesh_node_unique_verts(node)) {
BM_elem_flag_toggle(vert, BM_ELEM_HIDDEN);
fully_hidden &= BM_elem_flag_test(vert, BM_ELEM_HIDDEN);
fully_hidden &= BM_elem_flag_test_bool(vert, BM_ELEM_HIDDEN);
}
BKE_pbvh_node_fully_hidden_set(node, fully_hidden);
BKE_pbvh_node_mark_rebuild_draw(node);
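
The switch to BM_elem_flag_test_bool above matters because the accumulator is a bool: a raw flag test returns the masked bits, and `bool &= bits` silently drops to false whenever the tested flag is not bit 0. A standalone sketch of the failure mode (the flag value here is hypothetical, not the real BM_ELEM_HIDDEN):

// Sketch only: why &= on a bool needs a bool-valued flag test.
#include <cstdio>

int main()
{
  const char HIDDEN_FLAG = 1 << 1;       /* Hypothetical: any flag that is not bit 0. */
  const char elem_flags = HIDDEN_FLAG;   /* The element is hidden. */

  bool fully_hidden_raw = true;
  fully_hidden_raw &= (elem_flags & HIDDEN_FLAG);          /* 1 & 2 == 0 -> false, despite hidden. */

  bool fully_hidden_bool = true;
  fully_hidden_bool &= ((elem_flags & HIDDEN_FLAG) != 0);  /* Explicit bool -> stays true. */

  std::printf("raw test: %d, bool test: %d\n", fully_hidden_raw, fully_hidden_bool);
  return 0;
}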

View File

@ -233,19 +233,19 @@ struct GPUUniformBuf *GPU_material_sss_profile_get(GPUMaterial *material,
/**
* High level functions to create and use GPU materials.
*/
GPUMaterial *GPU_material_from_nodetree_find(struct ListBase *gpumaterials,
const void *engine_type,
int options);
/**
* \note Caller must use #GPU_material_from_nodetree_find to re-use existing materials,
* This is enforced since constructing other arguments to this function may be expensive
* so only do this when they are needed.
*/
typedef enum eGPUMaterialEngine {
GPU_MAT_EEVEE_LEGACY = 0,
GPU_MAT_EEVEE,
GPU_MAT_COMPOSITOR,
} eGPUMaterialEngine;
GPUMaterial *GPU_material_from_nodetree(struct Scene *scene,
struct Material *ma,
struct bNodeTree *ntree,
struct ListBase *gpumaterials,
const char *name,
eGPUMaterialEngine engine,
uint64_t shader_uuid,
bool is_volume_shader,
bool is_lookdev,
@ -421,7 +421,8 @@ typedef void (*ConstructGPUMaterialFn)(void *thunk, GPUMaterial *material);
/* Construct a GPU material from a set of callbacks. See the callback types for more information.
* The given thunk will be passed as the first parameter of each callback. */
GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_function_cb,
GPUMaterial *GPU_material_from_callbacks(eGPUMaterialEngine engine,
ConstructGPUMaterialFn construct_function_cb,
GPUCodegenCallbackFn generate_code_function_cb,
void *thunk);

View File

@ -97,6 +97,8 @@ struct GPUPass {
uint refcount;
/** The last time the refcount was greater than 0. */
int gc_timestamp;
/** The engine type this pass is compiled for. */
eGPUMaterialEngine engine;
/** Identity hash generated from all GLSL code. */
uint32_t hash;
/** Did we already tried to compile the attached GPUShader. */
@ -122,12 +124,12 @@ static SpinLock pass_cache_spin;
/* Search by hash only. Return first pass with the same hash.
* There is hash collision if (pass->next && pass->next->hash == hash) */
static GPUPass *gpu_pass_cache_lookup(uint32_t hash)
static GPUPass *gpu_pass_cache_lookup(eGPUMaterialEngine engine, uint32_t hash)
{
BLI_spin_lock(&pass_cache_spin);
/* Could be optimized with a Lookup table. */
for (GPUPass *pass = pass_cache; pass; pass = pass->next) {
if (pass->hash == hash) {
if (pass->hash == hash && pass->engine == engine) {
BLI_spin_unlock(&pass_cache_spin);
return pass;
}
@ -157,10 +159,12 @@ static GPUPass *gpu_pass_cache_resolve_collision(GPUPass *pass,
GPUShaderCreateInfo *info,
uint32_t hash)
{
eGPUMaterialEngine engine = pass->engine;
BLI_spin_lock(&pass_cache_spin);
for (; pass && (pass->hash == hash); pass = pass->next) {
if (*reinterpret_cast<ShaderCreateInfo *>(info) ==
*reinterpret_cast<ShaderCreateInfo *>(pass->create_info))
*reinterpret_cast<ShaderCreateInfo *>(pass->create_info) &&
pass->engine == engine)
{
BLI_spin_unlock(&pass_cache_spin);
return pass;
@ -732,6 +736,7 @@ void GPUCodegen::generate_graphs()
GPUPass *GPU_generate_pass(GPUMaterial *material,
GPUNodeGraph *graph,
eGPUMaterialEngine engine,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk,
bool optimize_graph)
@ -763,7 +768,7 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
* NOTE: We only perform cache look-up for non-optimized shader
* graphs, as baked constant data among other optimizations will generate too many
* shader source permutations, with minimal re-usability. */
pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
pass_hash = gpu_pass_cache_lookup(engine, codegen.hash_get());
/* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
* there is no way to have a collision currently. Some advocated to only use a bigger hash. */
@ -813,6 +818,7 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
pass->shader = nullptr;
pass->refcount = 1;
pass->create_info = codegen.create_info;
pass->engine = engine;
pass->hash = codegen.hash_get();
pass->compiled = false;
pass->cached = false;
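
The pass cache changes above add the engine to both the hash lookup and the collision check, so passes with identical generated-source hashes are not shared across engines. A reduced sketch of that keying over a simplified linked-list cache (struct and names are placeholders, not the GPUPass internals):

// Sketch only: cache lookup keyed by (engine, hash) over a linked list of passes.
#include <cstdint>
#include <cstdio>

enum class Engine { EeveeLegacy, Eevee, Compositor };

struct Pass {
  Pass *next = nullptr;
  Engine engine;
  uint32_t hash;
};

static Pass *cache_lookup(Pass *head, Engine engine, uint32_t hash)
{
  for (Pass *pass = head; pass; pass = pass->next) {
    /* Both the source hash and the engine must match for reuse. */
    if (pass->hash == hash && pass->engine == engine) {
      return pass;
    }
  }
  return nullptr;
}

int main()
{
  Pass a{nullptr, Engine::Eevee, 0xABCDu};
  Pass b{&a, Engine::Compositor, 0xABCDu};  /* Same hash, different engine. */
  std::printf("eevee hit: %p, legacy hit: %p\n",
              (void *)cache_lookup(&b, Engine::Eevee, 0xABCDu),
              (void *)cache_lookup(&b, Engine::EeveeLegacy, 0xABCDu));
  return 0;
}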

View File

@ -25,6 +25,7 @@ typedef struct GPUPass GPUPass;
GPUPass *GPU_generate_pass(GPUMaterial *material,
struct GPUNodeGraph *graph,
eGPUMaterialEngine engine,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk,
bool optimize_graph);

View File

@ -99,8 +99,9 @@ struct GPUMaterial {
eGPUMaterialStatus status;
/** Some flags about the nodetree & the needed resources. */
eGPUMaterialFlag flag;
/* Identify shader variations (shadow, probe, world background...).
* Should be unique even across render engines. */
/** The engine type this material is compiled for. */
eGPUMaterialEngine engine;
/* Identify shader variations (shadow, probe, world background...) */
uint64_t uuid;
/* Number of generated function. */
int generated_function_len;
@ -821,6 +822,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
bNodeTree *ntree,
ListBase *gpumaterials,
const char *name,
eGPUMaterialEngine engine,
uint64_t shader_uuid,
bool is_volume_shader,
bool is_lookdev,
@ -830,7 +832,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
/* Search if this material is not already compiled. */
LISTBASE_FOREACH (LinkData *, link, gpumaterials) {
GPUMaterial *mat = (GPUMaterial *)link->data;
if (mat->uuid == shader_uuid) {
if (mat->uuid == shader_uuid && mat->engine == engine) {
return mat;
}
}
@ -838,6 +840,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
GPUMaterial *mat = static_cast<GPUMaterial *>(MEM_callocN(sizeof(GPUMaterial), "GPUMaterial"));
mat->ma = ma;
mat->scene = scene;
mat->engine = engine;
mat->uuid = shader_uuid;
mat->flag = GPU_MATFLAG_UPDATED;
mat->status = GPU_MAT_CREATED;
@ -860,7 +863,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
{
/* Create source code and search pass cache for an already compiled version. */
mat->pass = GPU_generate_pass(mat, &mat->graph, callback, thunk, false);
mat->pass = GPU_generate_pass(mat, &mat->graph, engine, callback, thunk, false);
if (mat->pass == nullptr) {
/* We had a cache hit and the shader has already failed to compile. */
@ -891,7 +894,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
mat->optimize_pass_info.callback = callback;
mat->optimize_pass_info.thunk = thunk;
#else
mat->optimized_pass = GPU_generate_pass(mat, &mat->graph, callback, thunk, true);
mat->optimized_pass = GPU_generate_pass(mat, &mat->graph, engine, callback, thunk, true);
if (mat->optimized_pass == nullptr) {
/* Failed to create optimized pass. */
gpu_node_graph_free_nodes(&mat->graph);
@ -1024,8 +1027,12 @@ void GPU_material_optimize(GPUMaterial *mat)
* optimal, as these do not benefit from caching, due to baked constants. However, this could
* possibly be cause for concern for certain cases. */
if (!mat->optimized_pass) {
mat->optimized_pass = GPU_generate_pass(
mat, &mat->graph, mat->optimize_pass_info.callback, mat->optimize_pass_info.thunk, true);
mat->optimized_pass = GPU_generate_pass(mat,
&mat->graph,
mat->engine,
mat->optimize_pass_info.callback,
mat->optimize_pass_info.thunk,
true);
BLI_assert(mat->optimized_pass);
}
#else
@ -1097,7 +1104,8 @@ void GPU_materials_free(Main *bmain)
BKE_material_defaults_free_gpu();
}
GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_function_cb,
GPUMaterial *GPU_material_from_callbacks(eGPUMaterialEngine engine,
ConstructGPUMaterialFn construct_function_cb,
GPUCodegenCallbackFn generate_code_function_cb,
void *thunk)
{
@ -1110,6 +1118,7 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
material->optimization_status = GPU_MAT_OPTIMIZATION_SKIP;
material->optimized_pass = nullptr;
material->default_mat = nullptr;
material->engine = engine;
/* Construct the material graph by adding and linking the necessary GPU material nodes. */
construct_function_cb(thunk, material);
@ -1119,7 +1128,7 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
/* Lookup an existing pass in the cache or generate a new one. */
material->pass = GPU_generate_pass(
material, &material->graph, generate_code_function_cb, thunk, false);
material, &material->graph, material->engine, generate_code_function_cb, thunk, false);
material->optimized_pass = nullptr;
/* The pass already exists in the pass cache but its shader already failed to compile. */