Compare commits

455 commits — comparing node-tree-… with soc-2020-i…
Author | SHA1 | Date
---|---|---
(455 rows; only bare SHA1 hashes were captured in this export — the author, date and message columns are empty)
@@ -440,11 +440,7 @@ mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
mark_as_advanced(WITH_CUDA_DYNLOAD)

# AMD HIP
if(WIN32)
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" ON)
else()
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" OFF)
endif()
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" OFF)
option(WITH_CYCLES_HIP_BINARIES "Build Cycles AMD HIP binaries" OFF)
set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 CACHE STRING "AMD HIP architectures to build binaries for")
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)

@@ -646,7 +642,7 @@ if(WIN32)
option(WITH_WINDOWS_PDB "Generate a pdb file for client side stacktraces" ON)
mark_as_advanced(WITH_WINDOWS_PDB)

option(WITH_WINDOWS_STRIPPED_PDB "Use a stripped PDB file" ON)
option(WITH_WINDOWS_STRIPPED_PDB "Use a stripped PDB file" On)
mark_as_advanced(WITH_WINDOWS_STRIPPED_PDB)

endif()
@@ -168,7 +168,7 @@ def function_parm_wash_tokens(parm):
# if tokens[-1].kind == To
# remove trailing char
if tokens[-1].kind == TokenKind.PUNCTUATION:
if tokens[-1].spelling in {",", ")", ";"}:
if tokens[-1].spelling in (",", ")", ";"):
tokens.pop()
# else:
# print(tokens[-1].spelling)

@@ -179,7 +179,7 @@ def function_parm_wash_tokens(parm):
t_spelling = t.spelling
ok = True
if t_kind == TokenKind.KEYWORD:
if t_spelling in {"const", "restrict", "volatile"}:
if t_spelling in ("const", "restrict", "volatile"):
ok = False
elif t_spelling.startswith("__"):
ok = False # __restrict
@@ -81,5 +81,4 @@ if(NOT APPLE)
set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
set(WITH_CYCLES_HIP_BINARIES ON CACHE BOOL "" FORCE)
endif()
@@ -27,7 +27,7 @@ if(NOT MSVC)
endif()

if(CMAKE_C_COMPILER_ID MATCHES "Clang")
set(MSVC_CLANG ON)
set(MSVC_CLANG On)
set(VC_TOOLS_DIR $ENV{VCToolsRedistDir} CACHE STRING "Location of the msvc redistributables")
set(MSVC_REDIST_DIR ${VC_TOOLS_DIR})
if(DEFINED MSVC_REDIST_DIR)

@@ -53,7 +53,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
endif()
if(WITH_WINDOWS_STRIPPED_PDB)
message(WARNING "stripped pdb not supported with clang, disabling..")
set(WITH_WINDOWS_STRIPPED_PDB OFF)
set(WITH_WINDOWS_STRIPPED_PDB Off)
endif()
endif()

@@ -159,7 +159,7 @@ endif()
if(WITH_COMPILER_ASAN AND MSVC AND NOT MSVC_CLANG)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.28.29828)
#set a flag so we don't have to do this comparison all the time
SET(MSVC_ASAN ON)
SET(MSVC_ASAN On)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fsanitize=address")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fsanitize=address")
string(APPEND CMAKE_EXE_LINKER_FLAGS_DEBUG " /INCREMENTAL:NO")
@@ -179,22 +179,22 @@ endif()

if(WITH_WINDOWS_SCCACHE AND CMAKE_VS_MSBUILD_COMMAND)
message(WARNING "Disabling sccache, sccache is not supported with msbuild")
set(WITH_WINDOWS_SCCACHE OFF)
set(WITH_WINDOWS_SCCACHE Off)
endif()

# Debug Symbol format
# sccache # MSVC_ASAN # format # why
# ON # ON # Z7 # sccache will only play nice with Z7
# ON # OFF # Z7 # sccache will only play nice with Z7
# OFF # ON # Zi # Asan will not play nice with Edit and Continue
# OFF # OFF # ZI # Neither asan nor sscache is enabled Edit and Continue is available
# On # On # Z7 # sccache will only play nice with Z7
# On # Off # Z7 # sccache will only play nice with Z7
# Off # On # Zi # Asan will not play nice with Edit and Continue
# Off # Off # ZI # Neither asan nor sscache is enabled Edit and Continue is available

# Release Symbol format
# sccache # MSVC_ASAN # format # why
# ON # ON # Z7 # sccache will only play nice with Z7
# ON # OFF # Z7 # sccache will only play nice with Z7
# OFF # ON # Zi # Asan will not play nice with Edit and Continue
# OFF # OFF # Zi # Edit and Continue disables some optimizations
# On # On # Z7 # sccache will only play nice with Z7
# On # Off # Z7 # sccache will only play nice with Z7
# Off # On # Zi # Asan will not play nice with Edit and Continue
# Off # Off # Zi # Edit and Continue disables some optimizations

if(WITH_WINDOWS_SCCACHE)
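The comment table in this hunk encodes how the MSVC debug-symbol format is chosen from the sccache and MSVC_ASAN switches. A minimal sketch of that decision logic, written in plain Python for illustration only (it is not part of the build scripts):

```python
def debug_symbol_format(sccache: bool, msvc_asan: bool) -> str:
    # sccache only works with /Z7; ASAN breaks Edit and Continue (/ZI).
    if sccache:
        return "Z7"
    if msvc_asan:
        return "Zi"
    return "ZI"  # Edit and Continue is available when neither is enabled

def release_symbol_format(sccache: bool, msvc_asan: bool) -> str:
    # Same table, except Edit and Continue is never used for release builds.
    return "Z7" if sccache else "Zi"

assert debug_symbol_format(True, True) == "Z7"
assert debug_symbol_format(False, False) == "ZI"
assert release_symbol_format(False, False) == "Zi"
```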
@@ -288,7 +288,7 @@ if(CMAKE_GENERATOR MATCHES "^Visual Studio.+" AND # Only supported in the VS IDE
"EnableMicrosoftCodeAnalysis=false"
"EnableClangTidyCodeAnalysis=true"
)
set(VS_CLANG_TIDY ON)
set(VS_CLANG_TIDY On)
endif()

# Mark libdir as system headers with a lower warn level, to resolve some warnings
@@ -469,7 +469,7 @@ if(WITH_PYTHON)

set(PYTHON_INCLUDE_DIR ${LIBDIR}/python/${_PYTHON_VERSION_NO_DOTS}/include)
set(PYTHON_NUMPY_INCLUDE_DIRS ${LIBDIR}/python/${_PYTHON_VERSION_NO_DOTS}/lib/site-packages/numpy/core/include)
set(NUMPY_FOUND ON)
set(NUMPY_FOUND On)
unset(_PYTHON_VERSION_NO_DOTS)
# uncached vars
set(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
@@ -853,18 +853,18 @@ if(WITH_GMP)
set(GMP_INCLUDE_DIRS ${LIBDIR}/gmp/include)
set(GMP_LIBRARIES ${LIBDIR}/gmp/lib/libgmp-10.lib optimized ${LIBDIR}/gmp/lib/libgmpxx.lib debug ${LIBDIR}/gmp/lib/libgmpxx_d.lib)
set(GMP_ROOT_DIR ${LIBDIR}/gmp)
set(GMP_FOUND ON)
set(GMP_FOUND On)
endif()

if(WITH_POTRACE)
set(POTRACE_INCLUDE_DIRS ${LIBDIR}/potrace/include)
set(POTRACE_LIBRARIES ${LIBDIR}/potrace/lib/potrace.lib)
set(POTRACE_FOUND ON)
set(POTRACE_FOUND On)
endif()

if(WITH_HARU)
if(EXISTS ${LIBDIR}/haru)
set(HARU_FOUND ON)
set(HARU_FOUND On)
set(HARU_ROOT_DIR ${LIBDIR}/haru)
set(HARU_INCLUDE_DIRS ${HARU_ROOT_DIR}/include)
set(HARU_LIBRARIES ${HARU_ROOT_DIR}/lib/libhpdfs.lib)
@@ -2254,7 +2254,7 @@ def main():
# First monkey patch to load in fake members.
setup_monkey_patch()

# Perform changes to Blender itself.
# Perform changes to Blender it's self.
setup_data = setup_blender()

# eventually, create the dirs
@@ -138,6 +138,11 @@ endif()

blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

# avoid link failure with clang 3.4 debug
if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS '3.4')
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -gline-tables-only")
endif()

add_dependencies(bf_intern_cycles bf_rna)

delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH})
@@ -233,7 +233,6 @@ def list_render_passes(scene, srl):
if crl.denoising_store_passes:
yield ("Denoising Normal", "XYZ", 'VECTOR')
yield ("Denoising Albedo", "RGB", 'COLOR')
yield ("Denoising Depth", "Z", 'VALUE')

# Custom AOV passes.
for aov in srl.aovs:
@@ -40,10 +40,10 @@ class AddPresetIntegrator(AddPresetBase, Operator):
"cycles.transparent_max_bounces",
"cycles.caustics_reflective",
"cycles.caustics_refractive",
"cycles.blur_glossy",
"cycles.use_fast_gi",
"cycles.ao_bounces",
"cycles.ao_bounces_render",
"cycles.blur_glossy"
"cycles.use_fast_gi"
"cycles.ao_bounces"
"cycles.ao_bounces_render"
]

preset_subdir = "cycles/integrator"
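One side of this hunk lists the preset values with trailing commas while the other omits them; a quick illustration of why that matters in Python (adjacent string literals are concatenated into a single list entry):

```python
with_commas = [
    "cycles.blur_glossy",
    "cycles.use_fast_gi",
]
without_commas = [
    "cycles.blur_glossy"
    "cycles.use_fast_gi"
]
print(len(with_commas))     # 2
print(len(without_commas))  # 1 -> 'cycles.blur_glossycycles.use_fast_gi'
```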
@@ -87,7 +87,7 @@ enum_use_layer_samples = (

enum_sampling_pattern = (
('SOBOL', "Sobol", "Use Sobol random sampling pattern", 0),
('PROGRESSIVE_MULTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern", 1),
('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern", 1),
)

enum_volume_sampling = (
@@ -325,13 +325,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=1024,
)

sample_offset: IntProperty(
name="Sample Offset",
description="Number of samples to skip when starting render",
min=0, max=(1 << 24),
default=0,
)

time_limit: FloatProperty(
name="Time Limit",
description="Limit the render time (excluding synchronization time)."
@@ -346,7 +339,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
name="Sampling Pattern",
description="Random sampling pattern used by the integrator. When adaptive sampling is enabled, Progressive Multi-Jitter is always used instead of Sobol",
items=enum_sampling_pattern,
default='PROGRESSIVE_MULTI_JITTER',
default='PROGRESSIVE_MUTI_JITTER',
)

scrambling_distance: FloatProperty(
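The difference in this hunk is only the spelling of the default identifier, but an EnumProperty default must match one of the item identifiers exactly, which is why each side keeps its own spelling consistent with its enum_sampling_pattern tuple. A minimal sketch of how the items and default wire together (illustration only, runs inside Blender because of bpy):

```python
import bpy
from bpy.props import EnumProperty

# Identifiers are arbitrary strings; the default must be one of them verbatim.
enum_sampling_pattern = (
    ('SOBOL', "Sobol", "Use Sobol random sampling pattern", 0),
    ('PROGRESSIVE_MULTI_JITTER', "Progressive Multi-Jitter",
     "Use Progressive Multi-Jitter random sampling pattern", 1),
)

class DemoSettings(bpy.types.PropertyGroup):
    sampling_pattern: EnumProperty(
        name="Sampling Pattern",
        items=enum_sampling_pattern,
        default='PROGRESSIVE_MULTI_JITTER',  # must match an identifier above
    )
```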
@@ -1367,7 +1360,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
if len(devices) and compute_device_type != 'CPU':
if compute_device_type != 'CPU':
devices.extend(cpu_devices)
return devices
@@ -1385,18 +1378,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
self.refresh_devices()
return None

def get_compute_device_type(self):
if self.compute_device_type == '':
return 'NONE'
return self.compute_device_type

def get_num_gpu_devices(self):
import _cycles
compute_device_type = self.get_compute_device_type()
device_list = _cycles.available_devices(compute_device_type)
device_list = _cycles.available_devices(self.compute_device_type)
num = 0
for device in device_list:
if device[1] != compute_device_type:
if device[1] != self.compute_device_type:
continue
for dev in self.devices:
if dev.use and dev.id == device[2]:
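The change in this hunk is whether callers read self.compute_device_type directly or go through a helper that maps the unset value to an explicit 'NONE'. A minimal sketch of that helper pattern, with the class reduced to plain Python for illustration:

```python
class PreferencesSketch:
    compute_device_type = ''  # unset enum value as stored in preferences

    def get_compute_device_type(self):
        # Map the empty/unset enum value to an explicit 'NONE'.
        if self.compute_device_type == '':
            return 'NONE'
        return self.compute_device_type

prefs = PreferencesSketch()
assert prefs.get_compute_device_type() == 'NONE'
prefs.compute_device_type = 'CUDA'
assert prefs.get_compute_device_type() == 'CUDA'
```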
@@ -1426,9 +1413,9 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
elif device_type == 'HIP':
import sys
col.label(text="Requires discrete AMD GPU with RDNA architecture", icon='BLANK1')
col.label(text="Requires discrete AMD GPU with ??? architecture", icon='BLANK1')
if sys.platform[:3] == "win":
col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
col.label(text="and AMD driver version ??? or newer", icon='BLANK1')
return

for device in devices:
@@ -1438,16 +1425,15 @@ class CyclesPreferences(bpy.types.AddonPreferences):
row = layout.row()
row.prop(self, "compute_device_type", expand=True)

compute_device_type = self.get_compute_device_type()
if compute_device_type == 'NONE':
if self.compute_device_type == 'NONE':
return
row = layout.row()
devices = self.get_devices_for_type(compute_device_type)
self._draw_devices(row, compute_device_type, devices)
devices = self.get_devices_for_type(self.compute_device_type)
self._draw_devices(row, self.compute_device_type, devices)

import _cycles
has_peer_memory = 0
for device in _cycles.available_devices(compute_device_type):
for device in _cycles.available_devices(self.compute_device_type):
if device[3] and self.find_existing_device_entry(device).use:
has_peer_memory += 1
if has_peer_memory > 1:
@@ -290,9 +290,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
col.prop(cscene, "sampling_pattern", text="Pattern")

col = layout.column(align=True)
col.prop(cscene, "sample_offset")

layout.separator()

col = layout.column(align=True)
@@ -1054,7 +1051,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):

def has_geometry_visibility(ob):
return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT', 'VOLUME', 'POINTCLOUD', 'HAIR'}) or
return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT'}) or
(ob.instance_type == 'COLLECTION' and ob.instance_collection))
@@ -86,7 +86,7 @@ def do_versions(self):
# Device might not currently be available so this can fail
try:
if system.legacy_compute_device_type == 1:
prop.compute_device_type = 'NONE' # Was OpenCL
prop.compute_device_type = 'OPENCL'
elif system.legacy_compute_device_type == 2:
prop.compute_device_type = 'CUDA'
else:
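On one side of this hunk, legacy value 1 (formerly OpenCL) now falls back to 'NONE' because OpenCL support was dropped, while value 2 still maps to 'CUDA'. A condensed sketch of that mapping; the default for other legacy values is an assumption, not shown in the hunk:

```python
# Hypothetical condensed form of the mapping shown in the hunk above.
LEGACY_COMPUTE_DEVICE_TYPES = {
    1: 'NONE',   # was OpenCL; OpenCL support removed on one side of the diff
    2: 'CUDA',
}

def migrate_compute_device_type(legacy_value):
    # Fallback to 'NONE' is assumed for any other legacy value.
    return LEGACY_COMPUTE_DEVICE_TYPES.get(legacy_value, 'NONE')

assert migrate_compute_device_type(1) == 'NONE'
assert migrate_compute_device_type(2) == 'CUDA'
```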
@@ -97,12 +97,6 @@ def do_versions(self):
# Init device list for UI
prop.get_devices(prop.compute_device_type)

if bpy.context.preferences.version <= (3, 0, 40):
# Disable OpenCL device
prop = bpy.context.preferences.addons[__package__].preferences
if prop.is_property_set("compute_device_type") and prop['compute_device_type'] == 4:
prop.compute_device_type = 'NONE'

# We don't modify startup file because it assumes to
# have all the default values only.
if not bpy.data.is_saved:
@@ -243,7 +237,7 @@ def do_versions(self):
cscene.use_preview_denoising = False
if not cscene.is_property_set("sampling_pattern") or \
cscene.get('sampling_pattern') >= 2:
cscene.sampling_pattern = 'PROGRESSIVE_MULTI_JITTER'
cscene.sampling_pattern = 'PROGRESSIVE_MUTI_JITTER'

# Removal of square samples.
cscene = scene.cycles
@@ -639,7 +639,7 @@ void BlenderSync::sync_camera_motion(
/* TODO(sergey): De-duplicate calculation with camera sync. */
float fov = 2.0f * atanf((0.5f * sensor_size) / bcam.lens / aspectratio);
if (fov != cam->get_fov()) {
VLOG(3) << "Camera " << b_ob.name() << " FOV change detected.";
VLOG(1) << "Camera " << b_ob.name() << " FOV change detected.";
if (motion_time == 0.0f) {
cam->set_fov(fov);
}
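Written out, the field of view computed in this hunk is (symbol names taken from the code above):

$$\mathrm{fov} = 2\,\arctan\!\left(\frac{0.5\cdot\mathrm{sensor\_size}}{\mathrm{lens}\cdot\mathrm{aspectratio}}\right)$$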
@@ -304,6 +304,10 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
}
}

if (num_curves > 0) {
VLOG(1) << "Exporting curve segments for mesh " << hair->name;
}

hair->reserve_curves(hair->num_curves() + num_curves, hair->get_curve_keys().size() + num_keys);

num_keys = 0;

@@ -352,7 +356,7 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa

/* check allocation */
if ((hair->get_curve_keys().size() != num_keys) || (hair->num_curves() != num_curves)) {
VLOG(1) << "Hair memory allocation failed, clearing data.";
VLOG(1) << "Allocation failed, clearing data";
hair->clear(true);
}
}
@@ -408,11 +412,16 @@ static void export_hair_motion_validate_attribute(Hair *hair,
if (num_motion_keys != num_keys || !have_motion) {
/* No motion or hair "topology" changed, remove attributes again. */
if (num_motion_keys != num_keys) {
VLOG(1) << "Hair topology changed, removing motion attribute.";
VLOG(1) << "Hair topology changed, removing attribute.";
}
else {
VLOG(1) << "No motion, removing attribute.";
}
hair->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
}
else if (motion_step > 0) {
VLOG(1) << "Filling in new motion vertex position for motion_step " << motion_step;

/* Motion, fill up previous steps that we might have skipped because
* they had no motion, but we need them anyway now. */
for (int step = 0; step < motion_step; step++) {
@@ -428,12 +437,16 @@ static void export_hair_motion_validate_attribute(Hair *hair,

static void ExportCurveSegmentsMotion(Hair *hair, ParticleCurveData *CData, int motion_step)
{
VLOG(1) << "Exporting curve motion segments for hair " << hair->name << ", motion step "
<< motion_step;

/* find attribute */
Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
bool new_attribute = false;

/* add new attribute if it doesn't exist already */
if (!attr_mP) {
VLOG(1) << "Creating new motion vertex position attribute";
attr_mP = hair->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
new_attribute = true;
}
@@ -669,6 +682,10 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
const int num_keys = b_hair.points.length();
const int num_curves = b_hair.curves.length();

if (num_curves > 0) {
VLOG(1) << "Exporting curve segments for hair " << hair->name;
}

hair->reserve_curves(num_curves, num_keys);

/* Export curves and points. */
@@ -726,11 +743,15 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)

static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_step)
{
VLOG(1) << "Exporting curve motion segments for hair " << hair->name << ", motion step "
<< motion_step;

/* Find or add attribute. */
Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
bool new_attribute = false;

if (!attr_mP) {
VLOG(1) << "Creating new motion vertex position attribute";
attr_mP = hair->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
new_attribute = true;
}
@@ -62,46 +62,31 @@ bool BlenderSync::BKE_object_is_modified(BL::Object &b_ob)
return false;
}

bool BlenderSync::object_is_geometry(BObjectInfo &b_ob_info)
bool BlenderSync::object_is_geometry(BL::Object &b_ob)
{
BL::ID b_ob_data = b_ob_info.object_data;
BL::ID b_ob_data = b_ob.data();

if (!b_ob_data) {
return false;
}

BL::Object::type_enum type = b_ob_info.iter_object.type();
BL::Object::type_enum type = b_ob.type();

if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
/* Will be exported attached to mesh. */
return true;
}
else if (type == BL::Object::type_CURVE) {
/* Skip exporting curves without faces, overhead can be
* significant if there are many for path animation. */
BL::Curve b_curve(b_ob_data);

/* Other object types that are not meshes but evaluate to meshes are presented to render engines
* as separate instance objects. Metaballs and surface objects have not been affected by that
* change yet. */
if (type == BL::Object::type_SURFACE || type == BL::Object::type_META) {
return true;
return (b_curve.bevel_object() || b_curve.extrude() != 0.0f || b_curve.bevel_depth() != 0.0f ||
b_curve.dimensions() == BL::Curve::dimensions_2D || b_ob.modifiers.length());
}

return b_ob_data.is_a(&RNA_Mesh);
}

bool BlenderSync::object_can_have_geometry(BL::Object &b_ob)
{
BL::Object::type_enum type = b_ob.type();
switch (type) {
case BL::Object::type_MESH:
case BL::Object::type_CURVE:
case BL::Object::type_SURFACE:
case BL::Object::type_META:
case BL::Object::type_FONT:
case BL::Object::type_HAIR:
case BL::Object::type_POINTCLOUD:
case BL::Object::type_VOLUME:
return true;
default:
return false;
else {
return (b_ob_data.is_a(&RNA_Mesh) || b_ob_data.is_a(&RNA_Curve) ||
b_ob_data.is_a(&RNA_MetaBall));
}
}
@@ -176,11 +161,6 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
if (is_instance) {
persistent_id_array = b_instance.persistent_id();
persistent_id = persistent_id_array.data;
if (!b_ob_info.is_real_object_data()) {
/* Remember which object data the geometry is coming from, so that we can sync it when the
* object has changed. */
instance_geometries_by_object[b_ob_info.real_object.ptr.data].insert(b_ob_info.object_data);
}
}

/* light is handled separately */

@@ -207,7 +187,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
}

/* only interested in object that we can create meshes from */
if (!object_is_geometry(b_ob_info)) {
if (!object_is_geometry(b_ob)) {
return NULL;
}

@@ -294,7 +274,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,

object->set_visibility(visibility);

object->set_is_shadow_catcher(b_ob.is_shadow_catcher() || b_parent.is_shadow_catcher());
object->set_is_shadow_catcher(b_ob.is_shadow_catcher());

float shadow_terminator_shading_offset = get_float(cobject, "shadow_terminator_offset");
object->set_shadow_terminator_shading_offset(shadow_terminator_shading_offset);

@@ -580,7 +560,6 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
else {
geometry_motion_synced.clear();
}
instance_geometries_by_object.clear();

/* initialize culling */
BlenderObjectCulling culling(scene, b_scene);
@@ -157,6 +157,8 @@ static PyObject *init_func(PyObject * /*self*/, PyObject *args)

DebugFlags().running_inside_blender = true;

VLOG(2) << "Debug flags initialized to:\n" << DebugFlags();

Py_RETURN_NONE;
}

@@ -883,6 +885,8 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)

debug_flags_sync_from_scene(b_scene);

VLOG(2) << "Debug flags set to:\n" << DebugFlags();

debug_flags_set = true;

Py_RETURN_NONE;

@@ -892,6 +896,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
{
debug_flags_reset();
if (debug_flags_set) {
VLOG(2) << "Debug flags reset to:\n" << DebugFlags();
debug_flags_set = false;
}
Py_RETURN_NONE;
@@ -606,19 +606,6 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
pass->set_type(bake_type_to_pass(bake_type, bake_filter));
pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));

if (pass->get_type() == PASS_COMBINED) {
/* Filtering settings for combined pass. */
Integrator *integrator = scene->integrator;
integrator->set_use_direct_light((bake_filter & BL::BakeSettings::pass_filter_DIRECT) != 0);
integrator->set_use_indirect_light((bake_filter & BL::BakeSettings::pass_filter_INDIRECT) !=
0);
integrator->set_use_diffuse((bake_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0);
integrator->set_use_glossy((bake_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0);
integrator->set_use_transmission((bake_filter & BL::BakeSettings::pass_filter_TRANSMISSION) !=
0);
integrator->set_use_emission((bake_filter & BL::BakeSettings::pass_filter_EMIT) != 0);
}

session->set_display_driver(nullptr);
session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine));
@@ -162,19 +162,19 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
/* Object */
else if (b_id.is_a(&RNA_Object)) {
BL::Object b_ob(b_id);
const bool can_have_geometry = object_can_have_geometry(b_ob);
const bool is_light = !can_have_geometry && object_is_light(b_ob);
const bool is_geometry = object_is_geometry(b_ob);
const bool is_light = !is_geometry && object_is_light(b_ob);

if (b_ob.is_instancer() && b_update.is_updated_shading()) {
/* Needed for e.g. object color updates on instancer. */
object_map.set_recalc(b_ob);
}

if (can_have_geometry || is_light) {
if (is_geometry || is_light) {
const bool updated_geometry = b_update.is_updated_geometry();

/* Geometry (mesh, hair, volume). */
if (can_have_geometry) {
if (is_geometry) {
if (b_update.is_updated_transform() || b_update.is_updated_shading()) {
object_map.set_recalc(b_ob);
}
@@ -183,15 +183,6 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
(object_subdivision_type(b_ob, preview, experimental) != Mesh::SUBDIVISION_NONE)) {
BL::ID key = BKE_object_is_modified(b_ob) ? b_ob : b_ob.data();
geometry_map.set_recalc(key);

/* Sync all contained geometry instances as well when the object changed.. */
map<void *, set<BL::ID>>::const_iterator instance_geometries =
instance_geometries_by_object.find(b_ob.ptr.data);
if (instance_geometries != instance_geometries_by_object.end()) {
for (BL::ID geometry : instance_geometries->second) {
geometry_map.set_recalc(geometry);
}
}
}

if (updated_geometry) {
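One side of this hunk keeps a map from an object to the set of geometry data blocks instanced under it, so that a change to the object can mark all of those geometries for re-sync. A minimal sketch of that bookkeeping pattern, reduced to plain Python for illustration:

```python
from collections import defaultdict

# object key -> set of geometry data blocks instanced by that object
instance_geometries_by_object = defaultdict(set)

def remember_instance(obj_key, geometry_key):
    instance_geometries_by_object[obj_key].add(geometry_key)

def tag_for_resync(obj_key, set_recalc):
    # When the object changes, flag every geometry it instanced.
    for geometry_key in instance_geometries_by_object.get(obj_key, ()):
        set_recalc(geometry_key)
```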
@@ -375,9 +366,7 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
if ((preview && !preview_scrambling_distance) || use_adaptive_sampling)
scrambling_distance = 1.0f;

if (scrambling_distance != 1.0f) {
VLOG(3) << "Using scrambling distance: " << scrambling_distance;
}
VLOG(1) << "Used Scrambling Distance: " << scrambling_distance;
integrator->set_scrambling_distance(scrambling_distance);

if (get_boolean(cscene, "use_fast_gi")) {
@@ -835,25 +824,18 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* samples */
int samples = get_int(cscene, "samples");
int preview_samples = get_int(cscene, "preview_samples");
int sample_offset = get_int(cscene, "sample_offset");

if (background) {
params.samples = samples;
params.sample_offset = sample_offset;
}
else {
params.samples = preview_samples;
if (params.samples == 0) {
if (params.samples == 0)
params.samples = INT_MAX;
}
params.sample_offset = 0;
}

/* Clamp sample offset. */
params.sample_offset = clamp(params.sample_offset, 0, Integrator::MAX_SAMPLES);

/* Clamp samples. */
params.samples = clamp(params.samples, 0, Integrator::MAX_SAMPLES - params.sample_offset);
params.samples = min(params.samples, Integrator::MAX_SAMPLES);

/* Viewport Performance */
params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
@@ -208,8 +208,7 @@ class BlenderSync {
/* util */
void find_shader(BL::ID &id, array<Node *> &used_shaders, Shader *default_shader);
bool BKE_object_is_modified(BL::Object &b_ob);
bool object_is_geometry(BObjectInfo &b_ob_info);
bool object_can_have_geometry(BL::Object &b_ob);
bool object_is_geometry(BL::Object &b_ob);
bool object_is_light(BL::Object &b_ob);

/* variables */

@@ -226,8 +225,6 @@ class BlenderSync {
set<Geometry *> geometry_synced;
set<Geometry *> geometry_motion_synced;
set<Geometry *> geometry_motion_attribute_synced;
/** Remember which geometries come from which objects to be able to sync them after changes. */
map<void *, set<BL::ID>> instance_geometries_by_object;
set<float> motion_times;
void *world_map;
bool world_recalc;
@@ -38,6 +38,7 @@ void device_cpu_info(vector<DeviceInfo> &devices)
info.id = "CPU";
info.num = 0;
info.has_osl = true;
info.has_half_images = true;
info.has_nanovdb = true;
info.has_profiling = true;
if (openimagedenoise_supported()) {
@@ -68,8 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
{
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
* optimization. */
VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
<< " CPU kernels.";
VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name()
<< " kernels.";

if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::num_threads();

@@ -297,6 +297,11 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
Device::build_bvh(bvh, progress, refit);
}

const CPUKernels *CPUDevice::get_cpu_kernels() const
{
return &kernels;
}

void CPUDevice::get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{
@@ -57,6 +57,8 @@ class CPUDevice : public Device {
RTCDevice embree_device;
#endif

CPUKernels kernels;

CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
~CPUDevice();

@@ -88,6 +90,7 @@ class CPUDevice : public Device {

void build_bvh(BVH *bvh, Progress &progress, bool refit) override;

virtual const CPUKernels *get_cpu_kernels() const override;
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
virtual void *get_cpu_osl_memory() override;
@@ -26,9 +26,6 @@ CCL_NAMESPACE_BEGIN
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)

#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \
film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))

CPUKernels::CPUKernels()
: /* Integrator. */

@@ -53,25 +50,11 @@ CPUKernels::CPUKernels()
REGISTER_KERNEL(adaptive_sampling_filter_x),
REGISTER_KERNEL(adaptive_sampling_filter_y),
/* Cryptomatte. */
REGISTER_KERNEL(cryptomatte_postprocess),
/* Film Convert. */
REGISTER_KERNEL_FILM_CONVERT(depth),
REGISTER_KERNEL_FILM_CONVERT(mist),
REGISTER_KERNEL_FILM_CONVERT(sample_count),
REGISTER_KERNEL_FILM_CONVERT(float),
REGISTER_KERNEL_FILM_CONVERT(light_path),
REGISTER_KERNEL_FILM_CONVERT(float3),
REGISTER_KERNEL_FILM_CONVERT(motion),
REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
REGISTER_KERNEL_FILM_CONVERT(combined),
REGISTER_KERNEL_FILM_CONVERT(float4)
REGISTER_KERNEL(cryptomatte_postprocess)
{
}

#undef REGISTER_KERNEL
#undef REGISTER_KERNEL_FILM_CONVERT
#undef KERNEL_FUNCTIONS

CCL_NAMESPACE_END
@@ -17,13 +17,11 @@
#pragma once

#include "device/cpu/kernel_function.h"
#include "util/half.h"
#include "util/types.h"

CCL_NAMESPACE_BEGIN

struct KernelGlobalsCPU;
struct KernelFilmConvert;
struct IntegratorStateCPU;
struct TileInfo;

@@ -42,7 +40,7 @@ class CPUKernels {

IntegratorInitFunction integrator_init_from_camera;
IntegratorInitFunction integrator_init_from_bake;
IntegratorShadeFunction integrator_intersect_closest;
IntegratorFunction integrator_intersect_closest;
IntegratorFunction integrator_intersect_shadow;
IntegratorFunction integrator_intersect_subsurface;
IntegratorFunction integrator_intersect_volume_stack;

@@ -104,41 +102,6 @@ class CPUKernels {

CryptomattePostprocessFunction cryptomatte_postprocess;

/* Film Convert. */
using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
const float *buffer,
float *pixel,
const int width,
const int buffer_stride,
const int pixel_stride)>;
using FilmConvertHalfRGBAFunction =
CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
const float *buffer,
half4 *pixel,
const int width,
const int buffer_stride)>;

#define KERNEL_FILM_CONVERT_FUNCTION(name) \
FilmConvertFunction film_convert_##name; \
FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;

KERNEL_FILM_CONVERT_FUNCTION(depth)
KERNEL_FILM_CONVERT_FUNCTION(mist)
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
KERNEL_FILM_CONVERT_FUNCTION(float)

KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(float3)

KERNEL_FILM_CONVERT_FUNCTION(motion)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
KERNEL_FILM_CONVERT_FUNCTION(combined)
KERNEL_FILM_CONVERT_FUNCTION(float4)

#undef KERNEL_FILM_CONVERT_FUNCTION

CPUKernels();
};
@@ -144,6 +144,7 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.description = string(name);
info.num = num;

info.has_half_images = (major >= 3);
info.has_nanovdb = true;
info.denoisers = 0;

@@ -378,9 +378,7 @@ string CUDADevice::compile_kernel(const uint kernel_features,
cubin.c_str(),
common_cflags.c_str());

printf("Compiling %sCUDA kernel ...\n%s\n",
(use_adaptive_compilation()) ? "adaptive " : "",
command.c_str());
printf("Compiling CUDA kernel ...\n%s\n", command.c_str());

# ifdef _WIN32
command = "call " + command;

@@ -407,15 +405,13 @@ string CUDADevice::compile_kernel(const uint kernel_features,

bool CUDADevice::load_kernels(const uint kernel_features)
{
/* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
/* TODO(sergey): Support kernels re-load for CUDA devices.
*
* Currently re-loading kernel will invalidate memory pointers,
* causing problems in cuCtxSynchronize.
*/
if (cuModule) {
if (use_adaptive_compilation()) {
VLOG(1) << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
}
VLOG(1) << "Skipping kernel reload, not currently supported.";
return true;
}

@@ -931,6 +927,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
{
CUDAContextScope scope(this);

/* General variables for both architectures */
string bind_name = mem.name;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

@@ -1093,6 +1090,7 @@ void CUDADevice::tex_alloc(device_texture &mem)

if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
/* Kepler+, bindless textures. */
CUDA_RESOURCE_DESC resDesc;
memset(&resDesc, 0, sizeof(resDesc));
@@ -23,7 +23,6 @@
#include "device/queue.h"

#include "device/cpu/device.h"
#include "device/cpu/kernel.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"

@@ -286,6 +285,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.description = "Multi Device";
info.num = 0;

info.has_half_images = true;
info.has_nanovdb = true;
info.has_osl = true;
info.has_profiling = true;

@@ -332,6 +332,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}

/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_nanovdb &= device.has_nanovdb;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;

@@ -362,11 +363,10 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
return nullptr;
}

const CPUKernels &Device::get_cpu_kernels()
const CPUKernels *Device::get_cpu_kernels() const
{
/* Initialize CPU kernels once and reuse. */
static CPUKernels kernels;
return kernels;
LOG(FATAL) << "Device does not support CPU kernels.";
return nullptr;
}

void Device::get_cpu_kernel_thread_globals(
@@ -73,6 +73,7 @@ class DeviceInfo {
int num;
bool display_device; /* GPU is used as a display device. */
bool has_nanovdb; /* Support NanoVDB volumes. */
bool has_half_images; /* Support half-float textures. */
bool has_osl; /* Support Open Shading Language. */
bool has_profiling; /* Supports runtime collection of profiling info. */
bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */

@@ -89,6 +90,7 @@ class DeviceInfo {
num = 0;
cpu_threads = 0;
display_device = false;
has_half_images = false;
has_nanovdb = false;
has_osl = false;
has_profiling = false;

@@ -178,7 +180,7 @@ class Device {
* These may not be used on GPU or multi-devices. */

/* Get CPU kernel functions for native instruction set. */
static const CPUKernels &get_cpu_kernels();
virtual const CPUKernels *get_cpu_kernels() const;
/* Get kernel globals to pass to kernels. */
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
@@ -131,9 +131,9 @@ void device_hip_info(vector<DeviceInfo> &devices)
continue;
}

if (!hipSupportsDevice(num)) {
continue;
}
int major;
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num);
// TODO : (Arya) What is the last major version we are supporting?

DeviceInfo info;

@@ -141,6 +141,7 @@ void device_hip_info(vector<DeviceInfo> &devices)
info.description = string(name);
info.num = num;

info.has_half_images = (major >= 3);
info.has_nanovdb = true;
info.denoisers = 0;
@@ -146,18 +146,12 @@ HIPDevice::~HIPDevice()

bool HIPDevice::support_device(const uint /*kernel_features*/)
{
if (hipSupportsDevice(hipDevId)) {
return true;
}
else {
/* We only support Navi and above. */
hipDeviceProp_t props;
hipGetDeviceProperties(&props, hipDevId);
int major, minor;
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

set_error(string_printf("HIP backend requires AMD RDNA graphics card or up, but found %s.",
props.name));
return false;
}
// TODO : (Arya) What versions do we plan to support?
return true;
}

bool HIPDevice::check_peer_access(Device *peer_device)
@@ -222,6 +216,7 @@ string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
const string include_path = source_path;
string cflags = string_printf(
"-m%d "
"--ptxas-options=\"-v\" "
"--use_fast_math "
"-DHIPCC "
"-I\"%s\"",

@@ -233,7 +228,10 @@ string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
return cflags;
}

string HIPDevice::compile_kernel(const uint kernel_features, const char *name, const char *base)
string HIPDevice::compile_kernel(const uint kernel_features,
const char *name,
const char *base,
bool force_ptx)
{
/* Compute kernel name. */
int major, minor;
@@ -242,20 +240,35 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
hipDeviceProp_t props;
hipGetDeviceProperties(&props, hipDevId);

/* gcnArchName can contain tokens after the arch name with features, ie.
* `gfx1010:sramecc-:xnack-` so we tokenize it to get the first part. */
char *arch = strtok(props.gcnArchName, ":");
if (arch == NULL) {
arch = props.gcnArchName;
}

/* Attempt to use kernel provided with Blender. */
if (!use_adaptive_compilation()) {
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch));
VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
VLOG(1) << "Using precompiled kernel.";
return fatbin;
if (!force_ptx) {
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, props.gcnArchName));
VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
VLOG(1) << "Using precompiled kernel.";
return fatbin;
}
}

/* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
int ptx_major = major, ptx_minor = minor;
while (ptx_major >= 3) {
const string ptx = path_get(
string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
if (path_exists(ptx)) {
VLOG(1) << "Using precompiled kernel.";
return ptx;
}

if (ptx_minor > 0) {
ptx_minor--;
}
else {
ptx_major--;
ptx_minor = 9;
}
}
}
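The comment in this hunk explains that gcnArchName may carry feature suffixes after the architecture name, so only the first token is used. A quick illustration of that stripping in plain Python (not the strtok code from the file):

```python
def base_arch(gcn_arch_name: str) -> str:
    # "gfx1010:sramecc-:xnack-" -> "gfx1010"; names without ":" pass through unchanged.
    return gcn_arch_name.split(":", 1)[0]

assert base_arch("gfx1010:sramecc-:xnack-") == "gfx1010"
assert base_arch("gfx1030") == "gfx1030"
```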
@@ -279,10 +292,12 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
# ifdef _DEBUG
options.append(" -save-temps");
# endif
options.append(" --amdgpu-target=").append(arch);
options.append(" --amdgpu-target=").append(props.gcnArchName);

const string include_path = source_path;
const string fatbin_file = string_printf("cycles_%s_%s_%s", name, arch, kernel_md5.c_str());
const char *const kernel_arch = props.gcnArchName;
const string fatbin_file = string_printf(
"cycles_%s_%s_%s", name, kernel_arch, kernel_md5.c_str());
const string fatbin = path_cache_get(path_join("kernels", fatbin_file));
VLOG(1) << "Testing for locally compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {

@@ -292,9 +307,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c

# ifdef _WIN32
if (!use_adaptive_compilation() && have_precompiled_kernels()) {
if (!hipSupportsDevice(hipDevId)) {
if (major < 3) {
set_error(
string_printf("HIP backend requires compute capability 10.1 or up, but found %d.%d. "
string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. "
"Your GPU is not supported.",
major,
minor));

@@ -345,9 +360,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
source_path.c_str(),
fatbin.c_str());

printf("Compiling %sHIP kernel ...\n%s\n",
(use_adaptive_compilation()) ? "adaptive " : "",
command.c_str());
printf("Compiling HIP kernel ...\n%s\n", command.c_str());

# ifdef _WIN32
command = "call " + command;
@@ -374,14 +387,13 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c

bool HIPDevice::load_kernels(const uint kernel_features)
{
/* TODO(sergey): Support kernels re-load for HIP devices adaptive compile.
/* TODO(sergey): Support kernels re-load for HIP devices.
*
* Currently re-loading kernels will invalidate memory pointers.
* Currently re-loading kernel will invalidate memory pointers,
* causing problems in hipCtxSynchronize.
*/
if (hipModule) {
if (use_adaptive_compilation()) {
VLOG(1) << "Skipping HIP kernel reload for adaptive compilation, not currently supported.";
}
VLOG(1) << "Skipping kernel reload, not currently supported.";
return true;
}

@@ -390,9 +402,8 @@ bool HIPDevice::load_kernels(const uint kernel_features)
return false;

/* check if GPU is supported */
if (!support_device(kernel_features)) {
if (!support_device(kernel_features))
return false;
}

/* get kernel */
const char *kernel_name = "kernel";
@@ -897,6 +908,7 @@ void HIPDevice::tex_alloc(device_texture &mem)
{
HIPContextScope scope(this);

/* General variables for both architectures */
string bind_name = mem.name;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

@@ -1061,6 +1073,7 @@ void HIPDevice::tex_alloc(device_texture &mem)

if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
/* Kepler+, bindless textures. */
hipResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));

@@ -1151,8 +1164,6 @@ bool HIPDevice::should_use_graphics_interop()
* possible, but from the empiric measurements it can be considerably slower than using naive
* pixels copy. */

/* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
# if 0
HIPContextScope scope(this);

int num_all_devices = 0;

@@ -1171,7 +1182,6 @@ bool HIPDevice::should_use_graphics_interop()
return true;
}
}
# endif

return false;
}
@@ -93,7 +93,10 @@ class HIPDevice : public Device {

virtual string compile_kernel_get_common_cflags(const uint kernel_features);

string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip");
string compile_kernel(const uint kernel_features,
const char *name,
const char *base = "hip",
bool force_ptx = false);

virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);

@@ -48,7 +48,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
HIPDeviceQueue *queue_ = nullptr;
HIPDevice *device_ = nullptr;

/* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
/* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
uint opengl_pbo_id_ = 0;
/* Buffer area in pixels of the corresponding PBO. */
int64_t buffer_area_ = 0;
@@ -58,15 +58,6 @@ const char *hipewCompilerPath();
int hipewCompilerVersion();
# endif /* WITH_HIP_DYNLOAD */

static inline bool hipSupportsDevice(const int hipDevId)
{
int major, minor;
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

return (major > 10) || (major == 10 && minor >= 1);
}

CCL_NAMESPACE_END

#endif /* WITH_HIP */
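hipSupportsDevice above gates Cycles HIP support on a minimum compute capability of 10.1 (i.e. RDNA hardware); the same comparison, restated in plain Python for illustration:

```python
def hip_supports_device(major: int, minor: int) -> bool:
    # Equivalent to (major > 10) or (major == 10 and minor >= 1).
    return (major, minor) >= (10, 1)

assert hip_supports_device(10, 1)      # RDNA-class devices pass
assert not hip_supports_device(9, 6)   # older GCN generations are rejected
```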
@@ -48,6 +48,14 @@ OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
{
}

OptiXDevice::Denoiser::~Denoiser()
{
const CUDAContextScope scope(device);
if (optix_denoiser != nullptr) {
optixDenoiserDestroy(optix_denoiser);
}
}

OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: CUDADevice(info, stats, profiler),
sbt_data(this, "__sbt", MEM_READ_ONLY),

@@ -83,7 +91,6 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
};
# endif
if (DebugFlags().optix.use_debug) {
VLOG(1) << "Using OptiX debug mode.";
options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
}
optix_assert(optixDeviceContextCreate(cuContext, &options, &context));

@@ -125,11 +132,6 @@ OptiXDevice::~OptiXDevice()
}
}

/* Make sure denoiser is destroyed before device context! */
if (denoiser_.optix_denoiser != nullptr) {
optixDenoiserDestroy(denoiser_.optix_denoiser);
}

optixDeviceContextDestroy(context);
}
@@ -881,31 +883,27 @@ bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
optix_assert(optixDenoiserComputeMemoryResources(
denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));

/* Denoiser is invoked on whole images only, so no overlap needed (would be used for tiling). */
denoiser_.scratch_size = sizes.withoutOverlapScratchSizeInBytes;
denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
denoiser_.scratch_offset = sizes.stateSizeInBytes;

/* Allocate denoiser state if tile size has changed since last setup. */
denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);

/* Initialize denoiser state for the current tile size. */
const OptixResult result = optixDenoiserSetup(
denoiser_.optix_denoiser,
0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
on a stream that is not the default stream */
buffer_params.width,
buffer_params.height,
denoiser_.state.device_pointer,
denoiser_.scratch_offset,
denoiser_.state.device_pointer + denoiser_.scratch_offset,
denoiser_.scratch_size);
const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
denoiser_.queue.stream(),
buffer_params.width,
buffer_params.height,
denoiser_.state.device_pointer,
denoiser_.scratch_offset,
denoiser_.state.device_pointer +
denoiser_.scratch_offset,
denoiser_.scratch_size);
if (result != OPTIX_SUCCESS) {
set_error("Failed to set up OptiX denoiser");
return false;
}

cuda_assert(cuCtxSynchronize());

denoiser_.is_configured = true;
denoiser_.configured_size.x = buffer_params.width;
denoiser_.configured_size.y = buffer_params.height;
@@ -940,6 +938,8 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
}

device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);

/* Optional albedo and color passes. */
if (context.num_input_passes > 1) {
const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
@@ -970,7 +970,6 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)

/* Finally run denoising. */
OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */

OptixDenoiserLayer image_layers = {};
image_layers.input = color_layer;
image_layers.output = output_layer;
@@ -82,6 +82,7 @@ class OptiXDevice : public CUDADevice {
class Denoiser {
public:
explicit Denoiser(OptiXDevice *device);
~Denoiser();

OptiXDevice *device;
OptiXDeviceQueue queue;
@@ -73,8 +73,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
sizeof(device_ptr),
cuda_stream_));

if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
cuda_device_assert(
cuda_device_,
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -29,14 +29,23 @@ unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoisePa
{
DCHECK(params.use);

if (params.type == DENOISER_OPTIX && Device::available_devices(DEVICE_MASK_OPTIX).size()) {
return make_unique<OptiXDenoiser>(path_trace_device, params);
switch (params.type) {
case DENOISER_OPTIX:
return make_unique<OptiXDenoiser>(path_trace_device, params);

case DENOISER_OPENIMAGEDENOISE:
return make_unique<OIDNDenoiser>(path_trace_device, params);

case DENOISER_NUM:
case DENOISER_NONE:
case DENOISER_ALL:
/* pass */
break;
}

/* Always fallback to OIDN. */
DenoiseParams oidn_params = params;
oidn_params.type = DENOISER_OPENIMAGEDENOISE;
return make_unique<OIDNDenoiser>(path_trace_device, oidn_params);
LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen.";

return nullptr;
}

Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
@@ -138,6 +138,10 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
return false;
}

if (pass_access_info_.offset == PASS_UNUSED) {
return false;
}

const PassType type = pass_access_info_.type;
const PassMode mode = pass_access_info_.mode;
const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
@@ -14,12 +14,9 @@
* limitations under the License.
*/

#include "device/device.h"

#include "integrator/pass_accessor_cpu.h"

#include "session/buffers.h"

#include "util/log.h"
#include "util/tbb.h"
@@ -36,16 +33,70 @@ CCL_NAMESPACE_BEGIN
* Kernel processing.
*/

template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const
{
KernelFilmConvert kfilm_convert;
init_kernel_film_convert(&kfilm_convert, buffer_params, destination);

if (destination.pixels) {
/* NOTE: No overlays are applied since they are not used for final renders.
* Can be supported via some sort of specialization to avoid code duplication. */

run_get_pass_kernel_processor_float(
&kfilm_convert, render_buffers, buffer_params, destination, processor);
}

if (destination.pixels_half_rgba) {
/* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */

if (destination.num_components == 1) {
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
render_buffers,
buffer_params,
destination,
[&processor](const KernelFilmConvert *kfilm_convert,
ccl_global const float *buffer,
float *pixel_rgba) {
float pixel;
processor(kfilm_convert, buffer, &pixel);

pixel_rgba[0] = pixel;
pixel_rgba[1] = pixel;
pixel_rgba[2] = pixel;
pixel_rgba[3] = 1.0f;
});
}
else if (destination.num_components == 3) {
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
render_buffers,
buffer_params,
destination,
[&processor](const KernelFilmConvert *kfilm_convert,
ccl_global const float *buffer,
float *pixel_rgba) {
processor(kfilm_convert, buffer, pixel_rgba);
pixel_rgba[3] = 1.0f;
});
}
else if (destination.num_components == 4) {
run_get_pass_kernel_processor_half_rgba(
&kfilm_convert, render_buffers, buffer_params, destination, processor);
}
}
}

template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertFunction func) const
const Processor &processor) const
{
/* NOTE: No overlays are applied since they are not used for final renders.
* Can be supported via some sort of specialization to avoid code duplication. */

DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";

const int64_t pass_stride = buffer_params.pass_stride;
@@ -61,16 +112,21 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
(y * buffer_params.width + destination.offset) * pixel_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);

for (int64_t x = 0; x < buffer_params.window_width;
++x, buffer += pass_stride, pixel += pixel_stride) {
processor(kfilm_convert, buffer, pixel);
}
});
}

template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertHalfRGBAFunction func) const
const Processor &processor) const
{
const int64_t pass_stride = buffer_params.pass_stride;
const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
@@ -85,7 +141,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
const float *buffer = window_data + y * buffer_row_stride;
half4 *pixel = dst_start + y * destination_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {

float pixel_rgba[4];
processor(kfilm_convert, buffer, pixel_rgba);

film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);

*pixel = float4_to_half4_display(
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
}
});
}

@@ -98,25 +163,8 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const BufferParams &buffer_params, \
const Destination &destination) const \
{ \
const CPUKernels &kernels = Device::get_cpu_kernels(); \
KernelFilmConvert kfilm_convert; \
init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
\
if (destination.pixels) { \
run_get_pass_kernel_processor_float(&kfilm_convert, \
render_buffers, \
buffer_params, \
destination, \
kernels.film_convert_##pass); \
} \
\
if (destination.pixels_half_rgba) { \
run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
render_buffers, \
buffer_params, \
destination, \
kernels.film_convert_half_rgba_##pass); \
} \
run_get_pass_kernel_processor( \
render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
}

/* Float (scalar) passes. */
@@ -16,8 +16,6 @@

#pragma once

#include "device/cpu/kernel.h"

#include "integrator/pass_accessor.h"

CCL_NAMESPACE_BEGIN
@@ -30,19 +28,25 @@ class PassAccessorCPU : public PassAccessor {
using PassAccessor::PassAccessor;

protected:
inline void run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertFunction func) const;
template<typename Processor>
inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;

inline void run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertHalfRGBAFunction func) const;
template<typename Processor>
inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;

template<typename Processor>
inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;

#define DECLARE_PASS_ACCESSOR(pass) \
virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
@@ -380,10 +380,7 @@ void PathTrace::path_trace(RenderWork &render_work)
PathTraceWork *path_trace_work = path_trace_works_[i].get();

PathTraceWork::RenderStatistics statistics;
path_trace_work->render_samples(statistics,
render_work.path_trace.start_sample,
num_samples,
render_work.path_trace.sample_offset);
path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);

const double work_time = time_dt() - work_start_time;
work_balance_infos_[i].time_spent += work_time;
@@ -852,8 +849,7 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work)
const int2 tile_size = get_render_tile_size();
const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
const int current_sample = render_work.path_trace.start_sample +
render_work.path_trace.num_samples -
render_work.path_trace.sample_offset;
render_work.path_trace.num_samples;
progress_->add_samples(num_samples_added, current_sample);
}

@@ -75,10 +75,7 @@ class PathTraceWork {

/* Render given number of samples as a synchronous blocking call.
* The samples are added to the render buffer associated with this work. */
virtual void render_samples(RenderStatistics &statistics,
int start_sample,
int samples_num,
int sample_offset) = 0;
virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;

/* Copy render result from this work to the corresponding place of the GPU display.
*
@@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
DeviceScene *device_scene,
bool *cancel_requested_flag)
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
kernels_(Device::get_cpu_kernels())
kernels_(*(device->get_cpu_kernels()))
{
DCHECK_EQ(device->info.type, DEVICE_CPU);
}
@@ -71,17 +71,14 @@ void PathTraceWorkCPU::init_execution()

void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
int start_sample,
int samples_num,
int sample_offset)
int samples_num)
{
const int64_t image_width = effective_buffer_params_.width;
const int64_t image_height = effective_buffer_params_.height;
const int64_t total_pixels_num = image_width * image_height;

if (device_->profiler.active()) {
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.start_profiling();
}
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.start_profiling();
}

tbb::task_arena local_arena = local_tbb_arena_create(device_);
@@ -100,7 +97,6 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
work_tile.w = 1;
work_tile.h = 1;
work_tile.start_sample = start_sample;
work_tile.sample_offset = sample_offset;
work_tile.num_samples = 1;
work_tile.offset = effective_buffer_params_.offset;
work_tile.stride = effective_buffer_params_.stride;
@@ -110,10 +106,9 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
});
});
if (device_->profiler.active()) {
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.stop_profiling();
}

for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.stop_profiling();
}

statistics.occupancy = 1.0f;
@@ -48,8 +48,7 @@ class PathTraceWorkCPU : public PathTraceWork {

virtual void render_samples(RenderStatistics &statistics,
int start_sample,
int samples_num,
int sample_offset) override;
int samples_num) override;

virtual void copy_to_display(PathTraceDisplay *display,
PassMode pass_mode,
@@ -250,8 +250,7 @@ void PathTraceWorkGPU::init_execution()

void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
int start_sample,
int samples_num,
int sample_offset)
int samples_num)
{
/* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
* add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -262,7 +261,6 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
work_tile_scheduler_.reset(effective_buffer_params_,
start_sample,
samples_num,
sample_offset,
device_scene_->data.integrator.scrambling_distance);

enqueue_reset();
@@ -439,15 +437,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
DCHECK_LE(work_size, max_num_paths_);

switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
/* Closest ray intersection kernels with integrator state and render buffer. */
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};

queue_->enqueue(kernel, work_size, args);
break;
}

case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
@@ -817,10 +807,10 @@ bool PathTraceWorkGPU::should_use_graphics_interop()
interop_use_ = device->should_use_graphics_interop();

if (interop_use_) {
VLOG(2) << "Using graphics interop GPU display update.";
VLOG(2) << "Will be using graphics interop GPU display update.";
}
else {
VLOG(2) << "Using naive GPU display update.";
VLOG(2) << "Will be using naive GPU display update.";
}

interop_use_checked_ = true;
@@ -46,8 +46,7 @@ class PathTraceWorkGPU : public PathTraceWork {

virtual void render_samples(RenderStatistics &statistics,
int start_sample,
int samples_num,
int sample_offset) override;
int samples_num) override;

virtual void copy_to_display(PathTraceDisplay *display,
PassMode pass_mode,
@@ -88,16 +88,6 @@ int RenderScheduler::get_num_samples() const
return num_samples_;
}

void RenderScheduler::set_sample_offset(int sample_offset)
{
sample_offset_ = sample_offset;
}

int RenderScheduler::get_sample_offset() const
{
return sample_offset_;
}

void RenderScheduler::set_time_limit(double time_limit)
{
time_limit_ = time_limit;
@@ -120,15 +110,13 @@ int RenderScheduler::get_num_rendered_samples() const
return state_.num_rendered_samples;
}

void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples, int sample_offset)
void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
{
buffer_params_ = buffer_params;

update_start_resolution_divider();

set_num_samples(num_samples);
set_start_sample(sample_offset);
set_sample_offset(sample_offset);

/* In background mode never do lower resolution render preview, as it is not really supported
* by the software. */
@@ -183,7 +171,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples,

void RenderScheduler::reset_for_next_tile()
{
reset(buffer_params_, num_samples_, sample_offset_);
reset(buffer_params_, num_samples_);
}

bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
@@ -329,7 +317,6 @@ RenderWork RenderScheduler::get_render_work()

render_work.path_trace.start_sample = get_start_sample_to_path_trace();
render_work.path_trace.num_samples = get_num_samples_to_path_trace();
render_work.path_trace.sample_offset = get_sample_offset();

render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());

@@ -39,7 +39,6 @@ class RenderWork {
struct {
int start_sample = 0;
int num_samples = 0;
int sample_offset = 0;
} path_trace;

struct {
@@ -126,9 +125,6 @@ class RenderScheduler {
void set_num_samples(int num_samples);
int get_num_samples() const;

void set_sample_offset(int sample_offset);
int get_sample_offset() const;

/* Time limit for the path tracing tasks, in minutes.
* Zero disables the limit. */
void set_time_limit(double time_limit);
@@ -154,7 +150,7 @@ class RenderScheduler {

/* Reset scheduler, indicating that rendering will happen from scratch.
* Resets current rendered state, as well as scheduling information. */
void reset(const BufferParams &buffer_params, int num_samples, int sample_offset);
void reset(const BufferParams &buffer_params, int num_samples);

/* Reset scheduler upon switching to a next tile.
* Will keep the same number of samples and full-frame render parameters, but will reset progress
@@ -423,8 +419,6 @@ class RenderScheduler {
int start_sample_ = 0;
int num_samples_ = 0;

int sample_offset_ = 0;

/* Limit in seconds for how long path tracing is allowed to happen.
* Zero means no limit is applied. */
double time_limit_ = 0.0;
@@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
device->get_cpu_kernel_thread_globals(kernel_thread_globals);

/* Find required kernel function. */
const CPUKernels &kernels = Device::get_cpu_kernels();
const CPUKernels &kernels = *(device->get_cpu_kernels());

/* Simple parallel_for over all work items. */
KernelShaderEvalInput *input_data = input.data();
@@ -36,7 +36,6 @@ void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
void WorkTileScheduler::reset(const BufferParams &buffer_params,
int sample_start,
int samples_num,
int sample_offset,
float scrambling_distance)
{
/* Image buffer parameters. */
@@ -52,7 +51,6 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
/* Samples parameters. */
sample_start_ = sample_start;
samples_num_ = samples_num;
sample_offset_ = sample_offset;

/* Initialize new scheduling. */
reset_scheduler_state();
@@ -113,7 +111,6 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_
work_tile.h = tile_size_.height;
work_tile.start_sample = sample_start_ + start_sample;
work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
work_tile.sample_offset = sample_offset_;
work_tile.offset = offset_;
work_tile.stride = stride_;

@@ -41,7 +41,6 @@ class WorkTileScheduler {
void reset(const BufferParams &buffer_params,
int sample_start,
int samples_num,
int sample_offset,
float scrambling_distance);

/* Get work for a device.
@@ -80,7 +79,6 @@ class WorkTileScheduler {
* (splitting into a smaller work tiles). */
int sample_start_ = 0;
int samples_num_ = 0;
int sample_offset_ = 0;

/* Tile size which be scheduled for rendering. */
TileSize tile_size_;
@@ -39,10 +39,6 @@ set(SRC_KERNEL_DEVICE_HIP
device/hip/kernel.cpp
)

set(SRC_KERNEL_DEVICE_METAL
device/metal/kernel.metal
)

set(SRC_KERNEL_DEVICE_OPTIX
device/optix/kernel.cu
device/optix/kernel_shader_raytrace.cu
@@ -83,13 +79,6 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
device/optix/globals.h
)

set(SRC_KERNEL_DEVICE_METAL_HEADERS
device/metal/compat.h
device/metal/context_begin.h
device/metal/context_end.h
device/metal/globals.h
)

set(SRC_KERNEL_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -734,14 +723,12 @@ cycles_add_library(cycles_kernel "${LIB}"
${SRC_KERNEL_DEVICE_CUDA}
${SRC_KERNEL_DEVICE_HIP}
${SRC_KERNEL_DEVICE_OPTIX}
${SRC_KERNEL_DEVICE_METAL}
${SRC_KERNEL_HEADERS}
${SRC_KERNEL_DEVICE_CPU_HEADERS}
${SRC_KERNEL_DEVICE_GPU_HEADERS}
${SRC_KERNEL_DEVICE_CUDA_HEADERS}
${SRC_KERNEL_DEVICE_HIP_HEADERS}
${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
${SRC_KERNEL_DEVICE_METAL_HEADERS}
)

source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -753,7 +740,6 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -786,8 +772,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
@@ -18,7 +18,6 @@

/* CPU Kernel Interface */

#include "util/half.h"
#include "util/types.h"

#include "kernel/types.h"
@@ -37,7 +37,7 @@

KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
KERNEL_INTEGRATOR_SHADE_FUNCTION(intersect_closest);
KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
@@ -52,37 +52,6 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION

#define KERNEL_FILM_CONVERT_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride); \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride);

KERNEL_FILM_CONVERT_FUNCTION(depth)
KERNEL_FILM_CONVERT_FUNCTION(mist)
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
KERNEL_FILM_CONVERT_FUNCTION(float)

KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(float3)

KERNEL_FILM_CONVERT_FUNCTION(motion)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
KERNEL_FILM_CONVERT_FUNCTION(combined)
KERNEL_FILM_CONVERT_FUNCTION(float4)

#undef KERNEL_FILM_CONVERT_FUNCTION

/* --------------------------------------------------------------------
* Shader evaluation.
*/
@@ -47,8 +47,8 @@
# include "kernel/integrator/megakernel.h"

# include "kernel/film/adaptive_sampling.h"
# include "kernel/film/id_passes.h"
# include "kernel/film/read.h"
# include "kernel/film/id_passes.h"

# include "kernel/bake/bake.h"

@@ -112,7 +112,7 @@ CCL_NAMESPACE_BEGIN

DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
DEFINE_INTEGRATOR_SHADE_KERNEL(intersect_closest)
DEFINE_INTEGRATOR_KERNEL(intersect_closest)
DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
@@ -232,85 +232,6 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
#endif
}

/* --------------------------------------------------------------------
* Film Convert.
*/

#ifdef KERNEL_STUB

# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride) \
{ \
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
} \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride) \
{ \
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
}

#else

# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride) \
{ \
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
} \
} \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride) \
{ \
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
if (is_float) { \
pixel_rgba[1] = pixel_rgba[0]; \
pixel_rgba[2] = pixel_rgba[0]; \
} \
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
*pixel = float4_to_half4_display( \
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
} \
}

#endif

KERNEL_FILM_CONVERT_FUNCTION(depth, true)
KERNEL_FILM_CONVERT_FUNCTION(mist, true)
KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
KERNEL_FILM_CONVERT_FUNCTION(float, true)

KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
KERNEL_FILM_CONVERT_FUNCTION(float3, false)

KERNEL_FILM_CONVERT_FUNCTION(motion, false)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
KERNEL_FILM_CONVERT_FUNCTION(combined, false)
KERNEL_FILM_CONVERT_FUNCTION(float4, false)

#undef KERNEL_FILM_CONVERT_FUNCTION

#undef KERNEL_INVOKE
#undef DEFINE_INTEGRATOR_KERNEL
#undef DEFINE_INTEGRATOR_SHADE_KERNEL
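Editor's note: as a reading aid, here is roughly what one instantiation of the non-stub macro above expands to, using the depth pass purely as an illustration (a sketch of the expansion, not verbatim generated code):

    /* Sketch of KERNEL_FILM_CONVERT_FUNCTION(depth, true), float-output variant:
     * walk the row and let the per-pass helper convert each pixel. */
    void KERNEL_FUNCTION_FULL_NAME(film_convert_depth)(const KernelFilmConvert *kfilm_convert,
                                                       const float *buffer,
                                                       float *pixel,
                                                       const int width,
                                                       const int buffer_stride,
                                                       const int pixel_stride)
    {
      for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) {
        film_get_pass_pixel_depth(kfilm_convert, buffer, pixel);
      }
    }

The half_rgba variant of the same macro additionally broadcasts the scalar result to RGB (because is_float is true) and applies the pass overlays before converting to half4.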
@@ -75,7 +75,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -92,29 +92,12 @@

/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */

#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))

#define ccl_gpu_kernel_threads(block_num_threads) \
extern "C" __global__ void __launch_bounds__(block_num_threads)

#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)

#define ccl_gpu_kernel_call(x) x

/* Define a function object where "func" is the lambda body, and additional parameters are used to
* specify captured state */
#define ccl_gpu_kernel_lambda(func, ...) \
struct KernelLambda { \
__VA_ARGS__; \
__device__ int operator()(const int state) \
{ \
return (func); \
} \
} ccl_gpu_kernel_lambda_pass

/* sanity checks */

#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
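Editor's note: a hedged example of how the launch-bounds macro above composes with ccl_gpu_kernel_signature; the values 512 and 32 below are placeholders chosen for illustration, not numbers taken from this patch:

    /* Sketch: ccl_gpu_kernel(512, 32) ccl_gpu_kernel_signature(example, int arg)
     * expands approximately to the following, capping register usage so that
     * GPU_MULTIPRESSOR_MAX_REGISTERS / (512 * 32) blocks can stay resident per multiprocessor. */
    extern "C" __global__ void __launch_bounds__(512,
                                                 GPU_MULTIPRESSOR_MAX_REGISTERS / (512 * 32))
        kernel_gpu_example(int arg);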
@@ -65,9 +65,7 @@ ccl_device float cubic_h1(float a)

/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
float x,
float y)
ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;

@@ -96,7 +94,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureIn
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device_noinline T
kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;

@@ -171,7 +169,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl

template<typename T>
ccl_device_noinline T kernel_tex_image_interp_nanovdb(
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;

@@ -193,7 +191,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(

ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

/* float4, byte4, ushort4 and half4 */
const int texture_type = info.data_type;
@@ -228,7 +226,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
float3 P,
InterpolationType interp)
{
ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

if (info.use_transform_3d) {
P = transform_point(&info.transform_3d, P);
File diff suppressed because it is too large
@@ -31,43 +31,10 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif

#ifdef __KERNEL_METAL__
struct ActiveIndexContext {
ActiveIndexContext(int _thread_index,
int _global_index,
int _threadgroup_size,
int _simdgroup_size,
int _simd_lane_index,
int _simd_group_index,
int _num_simd_groups,
threadgroup int *_simdgroup_offset)
: thread_index(_thread_index),
global_index(_global_index),
blocksize(_threadgroup_size),
ccl_gpu_warp_size(_simdgroup_size),
thread_warp(_simd_lane_index),
warp_index(_simd_group_index),
num_warps(_num_simd_groups),
warp_offset(_simdgroup_offset)
{
}

const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
num_warps;
threadgroup int *warp_offset;

template<uint blocksizeDummy, typename IsActiveOp>
void active_index_array(const uint num_states,
ccl_global int *indices,
ccl_global int *num_indices,
IsActiveOp is_active_op)
{
const uint state_index = global_index;
#else
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
ccl_global int *indices,
ccl_global int *num_indices,
int *indices,
int *num_indices,
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -78,62 +45,43 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint warp_index = thread_index / ccl_gpu_warp_size;
const uint num_warps = blocksize / ccl_gpu_warp_size;

/* Test if state corresponding to this thread is active. */
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
#endif
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

/* Test if state corresponding to this thread is active. */
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
/* For each thread within a warp compute how many other active states precede it. */
const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);

/* For each thread within a warp compute how many other active states precede it. */
const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
ccl_gpu_thread_mask(thread_warp));

/* Last thread in warp stores number of active states for each warp. */
if (thread_warp == ccl_gpu_warp_size - 1) {
warp_offset[warp_index] = thread_offset + is_active;
}

ccl_gpu_syncthreads();

/* Last thread in block converts per-warp sizes to offsets, increments global size of
* index array and gets offset to write to. */
if (thread_index == blocksize - 1) {
/* TODO: parallelize this. */
int offset = 0;
for (int i = 0; i < num_warps; i++) {
int num_active = warp_offset[i];
warp_offset[i] = offset;
offset += num_active;
}

const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
}

ccl_gpu_syncthreads();

/* Write to index array. */
if (is_active) {
const uint block_offset = warp_offset[num_warps];
indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
}
/* Last thread in warp stores number of active states for each warp. */
if (thread_warp == ccl_gpu_warp_size - 1) {
warp_offset[warp_index] = thread_offset + is_active;
}

#ifdef __KERNEL_METAL__
}; /* end class ActiveIndexContext */
ccl_gpu_syncthreads();

/* inject the required thread params into a struct, and redirect to its templated member function
*/
# define gpu_parallel_active_index_array \
ActiveIndexContext(metal_local_id, \
metal_global_id, \
metal_local_size, \
simdgroup_size, \
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset) \
.active_index_array
#endif
/* Last thread in block converts per-warp sizes to offsets, increments global size of
* index array and gets offset to write to. */
if (thread_index == blocksize - 1) {
/* TODO: parallelize this. */
int offset = 0;
for (int i = 0; i < num_warps; i++) {
int num_active = warp_offset[i];
warp_offset[i] = offset;
offset += num_active;
}

const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
}

ccl_gpu_syncthreads();

/* Write to index array. */
if (is_active) {
const uint block_offset = warp_offset[num_warps];
indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
}
}

CCL_NAMESPACE_END
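Editor's note: the warp-ballot code above is easier to check against a plain serial reference. The helper below is hypothetical and only restates what the kernel computes (the indices of all active states, plus their count):

    /* Serial reference for gpu_parallel_active_index_array (illustration only). */
    template<typename IsActiveOp>
    inline void serial_active_index_array(const uint num_states,
                                          int *indices,
                                          int *num_indices,
                                          IsActiveOp is_active_op)
    {
      int num_active = 0;
      for (uint state_index = 0; state_index < num_states; state_index++) {
        if (is_active_op(state_index)) {
          indices[num_active++] = state_index;
        }
      }
      *num_indices = num_active;
    }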
@@ -33,12 +33,10 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif

__device__ void gpu_parallel_prefix_sum(const int global_id,
ccl_global int *counter,
ccl_global int *prefix_sum,
const int num_values)
template<uint blocksize>
__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
{
if (global_id != 0) {
if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
return;
}

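Editor's note: the hunk above only changes the signature of gpu_parallel_prefix_sum; its body is not shown here. For orientation, an exclusive prefix sum of the kind this utility is named after can be sketched serially as follows (hypothetical helper, not code from this patch):

    /* prefix_sum[i] receives the sum of values[0..i-1]; the return value is the grand total. */
    inline int serial_exclusive_prefix_sum(const int *values, int *prefix_sum, const int num_values)
    {
      int offset = 0;
      for (int i = 0; i < num_values; i++) {
        prefix_sum[i] = offset;
        offset += values[i];
      }
      return offset;
    }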
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)

template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
const uint num_states,
template<uint blocksize, typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint num_states,
const int num_states_limit,
ccl_global int *indices,
ccl_global int *num_indices,
ccl_global int *key_counter,
ccl_global int *key_prefix_sum,
int *indices,
int *num_indices,
int *key_counter,
int *key_prefix_sum,
GetKeyOp get_key_op)
{
const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
const int key = (state_index < num_states) ? get_key_op(state_index) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

@@ -74,7 +74,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -35,29 +35,12 @@

/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */

#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))

#define ccl_gpu_kernel_threads(block_num_threads) \
extern "C" __global__ void __launch_bounds__(block_num_threads)

#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)

#define ccl_gpu_kernel_call(x) x

/* Define a function object where "func" is the lambda body, and additional parameters are used to
* specify captured state */
#define ccl_gpu_kernel_lambda(func, ...) \
struct KernelLambda { \
__VA_ARGS__; \
__device__ int operator()(const int state) \
{ \
return (func); \
} \
} ccl_gpu_kernel_lambda_pass

/* sanity checks */

#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
@@ -58,98 +58,6 @@ using namespace metal;

#define kernel_assert(cond)

#define ccl_gpu_global_id_x() metal_global_id
#define ccl_gpu_warp_size simdgroup_size
#define ccl_gpu_thread_idx_x simd_group_index
#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)

#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
#define ccl_gpu_popc(x) popcount(x)

// clang-format off

/* kernel.h adapters */

#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)

/* Convert a comma-separated list into a semicolon-separated list
* (so that we can generate a struct based on kernel entry-point parameters). */
#define FN0()
#define FN1(p1) p1;
#define FN2(p1, p2) p1; p2;
#define FN3(p1, p2, p3) p1; p2; p3;
#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)

/* Generate a struct containing the entry-point parameters and a "run"
* method which can access them implicitly via this-> */
#define ccl_gpu_kernel_signature(name, ...) \
struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
uint num_simd_groups) ccl_global const; \
}; \
kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
INIT_DEBUG_BUFFER \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
uint num_simd_groups) ccl_global const

#define ccl_gpu_kernel_call(x) context.x

/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */
#define ccl_gpu_kernel_lambda(func, ...) \
struct KernelLambda \
{ \
KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
ccl_private MetalKernelContext &context; \
__VA_ARGS__; \
int operator()(const int state) const { return (func); } \
} ccl_gpu_kernel_lambda_pass(context)

// clang-format on

/* make_type definitions with Metal style element initializers */
#ifdef make_float2
# undef make_float2
@@ -216,38 +124,3 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
#define logf(x) trigmode::log(float(x))

#define NULL 0

/* texture bindings and sampler setup */

struct Texture2DParamsMetal {
texture2d<float, access::sample> tex;
};
struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};

struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
};

enum SamplerType {
SamplerFilterNearest_AddressRepeat,
SamplerFilterNearest_AddressClampEdge,
SamplerFilterNearest_AddressClampZero,

SamplerFilterLinear_AddressRepeat,
SamplerFilterLinear_AddressClampEdge,
SamplerFilterLinear_AddressClampZero,

SamplerCount
};

constant constexpr array<sampler, SamplerCount> metal_samplers = {
sampler(address::repeat, filter::nearest),
sampler(address::clamp_to_edge, filter::nearest),
sampler(address::clamp_to_zero, filter::nearest),
sampler(address::repeat, filter::linear),
sampler(address::clamp_to_edge, filter::linear),
sampler(address::clamp_to_zero, filter::linear),
};
|
||||
/*
|
||||
* Copyright 2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// clang-format off
|
||||
|
||||
/* Open the Metal kernel context class
|
||||
* Necessary to access resource bindings */
|
||||
class MetalKernelContext {
|
||||
public:
|
||||
constant KernelParamsMetal &launch_params_metal;
|
||||
constant MetalAncillaries *metal_ancillaries;
|
||||
|
||||
MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
|
||||
: launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
|
||||
{}
|
||||
|
||||
/* texture fetch adapter functions */
|
||||
typedef uint64_t ccl_gpu_tex_object;
|
||||
|
||||
template<typename T>
|
||||
inline __attribute__((__always_inline__))
|
||||
T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
|
||||
kernel_assert(0);
|
||||
return 0;
|
||||
}
|
||||
template<typename T>
|
||||
inline __attribute__((__always_inline__))
|
||||
T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
|
||||
kernel_assert(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// texture2d
|
||||
template<>
|
||||
inline __attribute__((__always_inline__))
|
||||
float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
|
||||
const uint tid(tex);
|
||||
const uint sid(tex >> 32);
|
||||
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
|
||||
}
|
||||
template<>
|
||||
inline __attribute__((__always_inline__))
|
||||
float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
|
||||
const uint tid(tex);
|
||||
const uint sid(tex >> 32);
|
||||
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
|
||||
}
|
||||
|
||||
// texture3d
|
||||
template<>
|
||||
inline __attribute__((__always_inline__))
|
||||
float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
|
||||
const uint tid(tex);
|
||||
const uint sid(tex >> 32);
|
||||
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
|
||||
}
|
||||
template<>
|
||||
inline __attribute__((__always_inline__))
|
||||
float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
|
||||
const uint tid(tex);
|
||||
const uint sid(tex >> 32);
|
||||
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
|
||||
}
|
||||
# include "kernel/device/gpu/image.h"
|
||||
|
||||
// clang-format on
|
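/* Illustrative host-side sketch, not part of the diff above: the 64-bit
 * ccl_gpu_tex_object handle that these adapters consume packs the texture slot
 * in the low 32 bits and the sampler enum in the high 32 bits, which is what
 * the `tid` / `sid` unpacking in the samplers above assumes. Function names
 * here are hypothetical. */
#include <cstdint>

static inline uint64_t pack_metal_tex_handle(uint32_t texture_slot, uint32_t sampler_id)
{
  /* Low word: texture index, high word: sampler index. */
  return (uint64_t(sampler_id) << 32) | uint64_t(texture_slot);
}

static inline void unpack_metal_tex_handle(uint64_t handle, uint32_t &texture_slot, uint32_t &sampler_id)
{
  texture_slot = uint32_t(handle);      /* matches `const uint tid(tex);` */
  sampler_id = uint32_t(handle >> 32);  /* matches `const uint sid(tex >> 32);` */
}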
@@ -1,23 +0,0 @@
/* Copyright 2021 Blender Foundation. Licensed under the Apache License, Version 2.0;
 * see http://www.apache.org/licenses/LICENSE-2.0. */

  }
;  /* end of MetalKernelContext class definition */

/* Silently redirect into the MetalKernelContext instance */
/* NOTE: These macros will need maintaining as entry-points change. */

#undef kernel_integrator_state
#define kernel_integrator_state context.launch_params_metal.__integrator_state
@@ -1,51 +0,0 @@
/* Copyright 2021 Blender Foundation. Licensed under the Apache License, Version 2.0;
 * see http://www.apache.org/licenses/LICENSE-2.0. */

/* Constant Globals */

#include "kernel/types.h"
#include "kernel/util/profiling.h"

#include "kernel/integrator/state.h"

CCL_NAMESPACE_BEGIN

typedef struct KernelParamsMetal {

#define KERNEL_TEX(type, name) ccl_constant type *name;
#include "kernel/textures.h"
#undef KERNEL_TEX

  const IntegratorStateGPU __integrator_state;
  const KernelData data;

} KernelParamsMetal;

typedef struct KernelGlobalsGPU {
  int unused[1];
} KernelGlobalsGPU;

typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;

#define kernel_data launch_params_metal.data
#define kernel_integrator_state launch_params_metal.__integrator_state

/* data lookup defines */

#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
#define kernel_tex_array(tex) launch_params_metal.tex

CCL_NAMESPACE_END
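/* Illustrative sketch of the KERNEL_TEX X-macro pattern used above, not part
 * of the diff: every KERNEL_TEX(type, name) entry listed in kernel/textures.h
 * expands into one constant pointer member of KernelParamsMetal. The entries
 * and macro name below are hypothetical, only to show the expansion. */
#include <cstdint>

#define EXAMPLE_KERNEL_TEX(type, name) const type *name;
struct ExampleParams {
  /* Each listed texture becomes a pointer member, exactly as the
   * "#include kernel/textures.h" line does for KernelParamsMetal. */
  EXAMPLE_KERNEL_TEX(float, example_floats)     /* hypothetical entry */
  EXAMPLE_KERNEL_TEX(uint32_t, example_flags)   /* hypothetical entry */
};
#undef EXAMPLE_KERNEL_TEX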
@@ -1,25 +0,0 @@
/* Copyright 2021 Blender Foundation. Licensed under the Apache License, Version 2.0;
 * see http://www.apache.org/licenses/LICENSE-2.0. */

/* Metal kernel entry points */

// clang-format off

#include "kernel/device/metal/compat.h"
#include "kernel/device/metal/globals.h"
#include "kernel/device/gpu/kernel.h"

// clang-format on
@@ -76,7 +76,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
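/* Minimal sketch of the flat index the two macros above compute:
 * global_id = block_index * block_size + thread_index_within_block, and
 * global_size = grid_size * block_size. Plain C++ stand-ins for illustration
 * only; the real macros map to the device's built-in indices. */
static inline unsigned int example_global_id_x(unsigned int block_idx,
                                               unsigned int block_dim,
                                               unsigned int thread_idx)
{
  return block_idx * block_dim + thread_idx;
}

static inline unsigned int example_global_size_x(unsigned int grid_dim, unsigned int block_dim)
{
  return grid_dim * block_dim;
}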
@@ -57,7 +57,7 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
  const int global_index = optixGetLaunchIndex().x;
  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
                                                       global_index;
  integrator_intersect_closest(nullptr, path_index, __params.render_buffer);
  integrator_intersect_closest(nullptr, path_index);
}

extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
@@ -33,72 +33,62 @@ CCL_NAMESPACE_BEGIN
 * them separately. */

ccl_device_inline void bsdf_eval_init(ccl_private BsdfEval *eval,
                                      const ClosureType closure_type,
                                      const bool is_diffuse,
                                      float3 value)
{
  eval->diffuse = zero_float3();
  eval->glossy = zero_float3();

  if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
  if (is_diffuse) {
    eval->diffuse = value;
  }
  else if (CLOSURE_IS_BSDF_GLOSSY(closure_type)) {
  else {
    eval->glossy = value;
  }

  eval->sum = value;
}

ccl_device_inline void bsdf_eval_accum(ccl_private BsdfEval *eval,
                                       const ClosureType closure_type,
                                       float3 value)
                                       const bool is_diffuse,
                                       float3 value,
                                       float mis_weight)
{
  if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
  value *= mis_weight;

  if (is_diffuse) {
    eval->diffuse += value;
  }
  else if (CLOSURE_IS_BSDF_GLOSSY(closure_type)) {
  else {
    eval->glossy += value;
  }

  eval->sum += value;
}

ccl_device_inline bool bsdf_eval_is_zero(ccl_private BsdfEval *eval)
{
  return is_zero(eval->sum);
  return is_zero(eval->diffuse) && is_zero(eval->glossy);
}

ccl_device_inline void bsdf_eval_mul(ccl_private BsdfEval *eval, float value)
{
  eval->diffuse *= value;
  eval->glossy *= value;
  eval->sum *= value;
}

ccl_device_inline void bsdf_eval_mul3(ccl_private BsdfEval *eval, float3 value)
{
  eval->diffuse *= value;
  eval->glossy *= value;
  eval->sum *= value;
}

ccl_device_inline float3 bsdf_eval_sum(ccl_private const BsdfEval *eval)
{
  return eval->sum;
  return eval->diffuse + eval->glossy;
}

ccl_device_inline float3 bsdf_eval_pass_diffuse_weight(ccl_private const BsdfEval *eval)
ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(ccl_private const BsdfEval *eval)
{
  /* Ratio of diffuse weight to recover proportions for writing to render pass.
  /* Ratio of diffuse and glossy to recover proportions for writing to render pass.
   * We assume reflection, transmission and volume scatter to be exclusive. */
  return safe_divide_float3_float3(eval->diffuse, eval->sum);
}

ccl_device_inline float3 bsdf_eval_pass_glossy_weight(ccl_private const BsdfEval *eval)
{
  /* Ratio of glossy weight to recover proportions for writing to render pass.
   * We assume reflection, transmission and volume scatter to be exclusive. */
  return safe_divide_float3_float3(eval->glossy, eval->sum);
  return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
}

/* --------------------------------------------------------------------
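/* Illustrative sketch, not from the diff: both variants above keep separate
 * diffuse and glossy accumulators so the light passes can later be split out
 * of the combined result. A minimal CPU-side analogue with a hypothetical
 * float3 type: */
struct ExampleFloat3 {
  float x, y, z;
};

struct ExampleBsdfEval {
  ExampleFloat3 diffuse{0.0f, 0.0f, 0.0f};
  ExampleFloat3 glossy{0.0f, 0.0f, 0.0f};
};

static inline void example_bsdf_eval_accum(ExampleBsdfEval &eval, bool is_diffuse, ExampleFloat3 value)
{
  /* Pick the bucket; the value used for sampling is always diffuse + glossy. */
  ExampleFloat3 &bucket = is_diffuse ? eval.diffuse : eval.glossy;
  bucket.x += value.x;
  bucket.y += value.y;
  bucket.z += value.z;
}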
@@ -151,8 +141,7 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
                                          ConstIntegratorState state,
                                          ccl_global float *ccl_restrict render_buffer,
                                          int sample,
                                          int sample_offset)
                                          int sample)
{
  if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
    return sample;
@@ -160,8 +149,7 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);

  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
         sample_offset;
  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
}

ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
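/* Sketch of the per-pixel sample counting above, illustration only: an atomic
 * add on the pass_sample_count channel returns the number of samples that were
 * already accumulated, which becomes this path's sample index. Host-side
 * stand-in using std::atomic; the function name is hypothetical. */
#include <atomic>
#include <cstdint>

static inline uint32_t example_accum_sample(std::atomic<uint32_t> &pixel_sample_count)
{
  /* fetch_add returns the previous value, i.e. the index of this sample. */
  return pixel_sample_count.fetch_add(1u);
}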
@@ -363,48 +351,38 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(KernelGlobals kg
    /* Directly visible, write to emission or background pass. */
    pass_offset = pass;
  }
  else if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
    if (path_flag & PATH_RAY_SURFACE_PASS) {
      /* Indirectly visible through reflection. */
      const float3 diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
      const float3 glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);
    else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
      /* Indirectly visible through reflection. */
      const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
                                         ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
                                              kernel_data.film.pass_glossy_direct :
                                              kernel_data.film.pass_glossy_indirect) :
                                         ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
                                              kernel_data.film.pass_transmission_direct :
                                              kernel_data.film.pass_transmission_indirect);

      /* Glossy */
      const int glossy_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
                                          kernel_data.film.pass_glossy_direct :
                                          kernel_data.film.pass_glossy_indirect);
      if (glossy_pass_offset != PASS_UNUSED) {
        kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
      }

      /* Transmission */
      const int transmission_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
                                                kernel_data.film.pass_transmission_direct :
                                                kernel_data.film.pass_transmission_indirect);

      if (transmission_pass_offset != PASS_UNUSED) {
        /* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
         * GPU memory. */
        const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
        kernel_write_pass_float3(buffer + transmission_pass_offset,
                                 transmission_weight * contribution);
      }

      /* Reconstruct diffuse subset of throughput. */
      pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
                        kernel_data.film.pass_diffuse_direct :
                        kernel_data.film.pass_diffuse_indirect;
      if (pass_offset != PASS_UNUSED) {
        contribution *= diffuse_weight;
      }
      if (glossy_pass_offset != PASS_UNUSED) {
        /* Glossy is a subset of the throughput, reconstruct it here using the
         * diffuse-glossy ratio. */
        const float3 ratio = INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
        const float3 glossy_contribution = (one_float3() - ratio) * contribution;
        kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
      }
    else if (path_flag & PATH_RAY_VOLUME_PASS) {
      /* Indirectly visible through volume. */
      pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
                        kernel_data.film.pass_volume_direct :
                        kernel_data.film.pass_volume_indirect;

      /* Reconstruct diffuse subset of throughput. */
      pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
                        kernel_data.film.pass_diffuse_direct :
                        kernel_data.film.pass_diffuse_indirect;
      if (pass_offset != PASS_UNUSED) {
        contribution *= INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
      }
    }
    else if (path_flag & PATH_RAY_VOLUME_PASS) {
      /* Indirectly visible through volume. */
      pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
                        kernel_data.film.pass_volume_direct :
                        kernel_data.film.pass_volume_indirect;
    }

    /* Single write call for GPU coherence. */
    if (pass_offset != PASS_UNUSED) {
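/* Sketch of the reconstruction above, illustration only: with the older
 * diffuse_glossy_ratio representation the diffuse part of a contribution is
 * ratio * contribution and the glossy part is (1 - ratio) * contribution,
 * while the newer code stores pass_diffuse_weight / pass_glossy_weight
 * explicitly and derives transmission as the remainder. The helper name is
 * hypothetical. */
static inline void example_split_contribution(float ratio,
                                              float contribution,
                                              float &diffuse_part,
                                              float &glossy_part)
{
  diffuse_part = ratio * contribution;
  glossy_part = (1.0f - ratio) * contribution;
}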
@@ -448,56 +426,45 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
|
||||
#ifdef __PASSES__
|
||||
if (kernel_data.film.light_pass_flag & PASS_ANY) {
|
||||
const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
|
||||
int pass_offset = PASS_UNUSED;
|
||||
|
||||
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
|
||||
int pass_offset = PASS_UNUSED;
|
||||
if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
|
||||
/* Indirectly visible through reflection. */
|
||||
const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
|
||||
((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_glossy_direct :
|
||||
kernel_data.film.pass_glossy_indirect) :
|
||||
((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_transmission_direct :
|
||||
kernel_data.film.pass_transmission_indirect);
|
||||
|
||||
if (path_flag & PATH_RAY_SURFACE_PASS) {
|
||||
/* Indirectly visible through reflection. */
|
||||
const float3 diffuse_weight = INTEGRATOR_STATE(state, shadow_path, pass_diffuse_weight);
|
||||
const float3 glossy_weight = INTEGRATOR_STATE(state, shadow_path, pass_glossy_weight);
|
||||
|
||||
/* Glossy */
|
||||
const int glossy_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_glossy_direct :
|
||||
kernel_data.film.pass_glossy_indirect);
|
||||
if (glossy_pass_offset != PASS_UNUSED) {
|
||||
kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
|
||||
}
|
||||
|
||||
/* Transmission */
|
||||
const int transmission_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_transmission_direct :
|
||||
kernel_data.film.pass_transmission_indirect);
|
||||
|
||||
if (transmission_pass_offset != PASS_UNUSED) {
|
||||
/* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
|
||||
* GPU memory. */
|
||||
const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
|
||||
kernel_write_pass_float3(buffer + transmission_pass_offset,
|
||||
transmission_weight * contribution);
|
||||
}
|
||||
|
||||
/* Reconstruct diffuse subset of throughput. */
|
||||
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_diffuse_direct :
|
||||
kernel_data.film.pass_diffuse_indirect;
|
||||
if (pass_offset != PASS_UNUSED) {
|
||||
contribution *= diffuse_weight;
|
||||
}
|
||||
}
|
||||
else if (path_flag & PATH_RAY_VOLUME_PASS) {
|
||||
/* Indirectly visible through volume. */
|
||||
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_volume_direct :
|
||||
kernel_data.film.pass_volume_indirect;
|
||||
if (glossy_pass_offset != PASS_UNUSED) {
|
||||
/* Glossy is a subset of the throughput, reconstruct it here using the
|
||||
* diffuse-glossy ratio. */
|
||||
const float3 ratio = INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
|
||||
const float3 glossy_contribution = (one_float3() - ratio) * contribution;
|
||||
kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
|
||||
}
|
||||
|
||||
/* Single write call for GPU coherence. */
|
||||
/* Reconstruct diffuse subset of throughput. */
|
||||
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_diffuse_direct :
|
||||
kernel_data.film.pass_diffuse_indirect;
|
||||
if (pass_offset != PASS_UNUSED) {
|
||||
kernel_write_pass_float3(buffer + pass_offset, contribution);
|
||||
contribution *= INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
|
||||
}
|
||||
}
|
||||
else if (path_flag & PATH_RAY_VOLUME_PASS) {
|
||||
/* Indirectly visible through volume. */
|
||||
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
|
||||
kernel_data.film.pass_volume_direct :
|
||||
kernel_data.film.pass_volume_indirect;
|
||||
}
|
||||
|
||||
/* Single write call for GPU coherence. */
|
||||
if (pass_offset != PASS_UNUSED) {
|
||||
kernel_write_pass_float3(buffer + pass_offset, contribution);
|
||||
}
|
||||
|
||||
/* Write shadow pass. */
|
||||
if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
|
||||
@@ -573,10 +540,11 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
/* Write emission to render buffer. */
ccl_device_inline void kernel_accum_emission(KernelGlobals kg,
                                             ConstIntegratorState state,
                                             const float3 throughput,
                                             const float3 L,
                                             ccl_global float *ccl_restrict render_buffer)
{
  float3 contribution = L;
  float3 contribution = throughput * L;
  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
|
@@ -160,6 +160,40 @@ ccl_device_forceinline void kernel_write_denoising_features_volume(KernelGlobals
|
||||
}
|
||||
#endif /* __DENOISING_FEATURES__ */
|
||||
|
||||
#ifdef __SHADOW_CATCHER__
|
||||
|
||||
/* Write shadow catcher passes on a bounce from the shadow catcher object. */
|
||||
ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
|
||||
KernelGlobals kg,
|
||||
IntegratorState state,
|
||||
ccl_private const ShaderData *sd,
|
||||
ccl_global float *ccl_restrict render_buffer)
|
||||
{
|
||||
if (!kernel_data.integrator.has_shadow_catcher) {
|
||||
return;
|
||||
}
|
||||
|
||||
kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
|
||||
kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
|
||||
|
||||
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, sd->object_flag)) {
|
||||
return;
|
||||
}
|
||||
|
||||
ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
|
||||
|
||||
/* Count sample for the shadow catcher object. */
|
||||
kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
|
||||
|
||||
/* Since the split is done, the sample does not contribute to the matte, so accumulate it as
|
||||
* transparency to the matte. */
|
||||
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
|
||||
kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
|
||||
average(throughput));
|
||||
}
|
||||
|
||||
#endif /* __SHADOW_CATCHER__ */
|
||||
|
||||
ccl_device_inline size_t kernel_write_id_pass(ccl_global float *ccl_restrict buffer,
|
||||
size_t depth,
|
||||
float id,
|
||||
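/* Sketch of the matte write above, illustration only: the shadow catcher matte
 * pass stores RGBA, so the transparency of a split-off sample is accumulated
 * into component 3 (alpha) as the average of the path throughput. Stand-in
 * code with hypothetical names. */
static inline void example_accum_matte_transparency(float *matte_pass_rgba,
                                                    float tx, float ty, float tz)
{
  const float average_throughput = (tx + ty + tz) * (1.0f / 3.0f);
  matte_pass_rgba[3] += average_throughput; /* matches pass_shadow_catcher_matte + 3 */
}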
|
@@ -65,8 +65,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
|
||||
}
|
||||
|
||||
/* Always count the sample, even if the camera sample will reject the ray. */
|
||||
const int sample = kernel_accum_sample(
|
||||
kg, state, render_buffer, scheduled_sample, tile->sample_offset);
|
||||
const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
|
||||
|
||||
/* Setup render buffers. */
|
||||
const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
|
||||
|
@@ -89,8 +89,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
|
||||
* This logic allows to both count actual number of samples per pixel, and to add samples to this
|
||||
* pixel after it was converged and samples were added somewhere else (in which case the
|
||||
* `scheduled_sample` will be different from actual number of samples in this pixel). */
|
||||
const int sample = kernel_accum_sample(
|
||||
kg, state, render_buffer, scheduled_sample, tile->sample_offset);
|
||||
const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
|
||||
|
||||
/* Initialize random number seed for path. */
|
||||
const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
|
||||
|
@@ -31,6 +31,7 @@

CCL_NAMESPACE_BEGIN

template<uint32_t current_kernel>
ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
                                                           IntegratorState state,
                                                           const int shader_flags)
@@ -62,7 +63,6 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
   * perform MIS as part of indirect rays. */
  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
  const float probability = path_state_continuation_probability(kg, state, path_flag);
  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = probability;

  if (probability != 1.0f) {
    const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
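/* Sketch of the template pattern above, illustration only: the kernel that is
 * doing the scheduling is passed as a compile-time constant, so the queueing
 * code indexes its counters with a literal instead of a runtime variable; per
 * the note in the diff this keeps the CUDA atomic bookkeeping fast. Names are
 * hypothetical. */
#include <cstdint>

template<uint32_t current_kernel>
static inline void example_schedule_next(int *queue_counters, uint32_t next_kernel)
{
  --queue_counters[current_kernel]; /* compile-time index */
  ++queue_counters[next_kernel];
}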
@@ -85,80 +85,36 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef __SHADOW_CATCHER__
|
||||
/* Split path if a shadow catcher was hit. */
|
||||
ccl_device_forceinline void integrator_split_shadow_catcher(
|
||||
/* Note that current_kernel is a template value since making this a variable
|
||||
* leads to poor performance with CUDA atomics. */
|
||||
template<uint32_t current_kernel>
|
||||
ccl_device_forceinline void integrator_intersect_shader_next_kernel(
|
||||
KernelGlobals kg,
|
||||
IntegratorState state,
|
||||
ccl_private const Intersection *ccl_restrict isect,
|
||||
ccl_global float *ccl_restrict render_buffer)
|
||||
const int shader,
|
||||
const int shader_flags)
|
||||
{
|
||||
/* Test if we hit a shadow catcher object, and potentially split the path to continue tracing two
|
||||
* paths from here. */
|
||||
const int object_flags = intersection_get_object_flags(kg, isect);
|
||||
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
|
||||
return;
|
||||
}
|
||||
|
||||
kernel_write_shadow_catcher_bounce_data(kg, state, render_buffer);
|
||||
|
||||
/* Mark state as having done a shadow catcher split so that it stops contributing to
|
||||
* the shadow catcher matte pass, but keeps contributing to the combined pass. */
|
||||
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
|
||||
|
||||
/* Copy current state to new state. */
|
||||
state = integrator_state_shadow_catcher_split(kg, state);
|
||||
|
||||
/* Initialize new state.
|
||||
/* Note on scheduling.
|
||||
*
|
||||
* When there is no shadow catcher split the scheduling is simple: schedule surface shading with
|
||||
* or without raytrace support, depending on the shader used.
|
||||
*
|
||||
* When there is a shadow catcher split the general idea is to have the following configuration:
|
||||
*
|
||||
* - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
|
||||
* will trace shadow catcher object.
|
||||
*
|
||||
* - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
|
||||
* the matte ray.
|
||||
*
|
||||
* - Otherwise schedule background shading kernel, so that we have a background to alpha-over
|
||||
* on. The background kernel will then schedule surface shading for the matte ray.
|
||||
*
|
||||
* Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
|
||||
* the matte path. */
|
||||
|
||||
/* Mark current state so that it will only track contribution of shadow catcher objects ignoring
|
||||
* non-catcher objects. */
|
||||
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
|
||||
|
||||
if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
|
||||
/* If using background pass, schedule background shading kernel so that we have a background
|
||||
* to alpha-over on. The background kernel will then continue the path afterwards. */
|
||||
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
|
||||
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!integrator_state_volume_stack_is_empty(kg, state)) {
|
||||
/* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
|
||||
* objects from it, and then continue shading volume and shadow catcher surface after. */
|
||||
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Continue with shading shadow catcher surface. */
|
||||
const int shader = intersection_get_shader(kg, isect);
|
||||
const int flags = kernel_tex_fetch(__shaders, shader).flags;
|
||||
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
|
||||
|
||||
if (use_raytrace_kernel) {
|
||||
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
|
||||
}
|
||||
else {
|
||||
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
|
||||
}
|
||||
}
|
||||
|
||||
/* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
|
||||
template<uint32_t current_kernel>
|
||||
ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
|
||||
KernelGlobals kg, IntegratorState state)
|
||||
{
|
||||
/* Continue with shading shadow catcher surface. Same as integrator_split_shadow_catcher, but
|
||||
* using NEXT instead of INIT. */
|
||||
Intersection isect ccl_optional_struct_init;
|
||||
integrator_state_read_isect(kg, state, &isect);
|
||||
|
||||
const int shader = intersection_get_shader(kg, &isect);
|
||||
const int flags = kernel_tex_fetch(__shaders, shader).flags;
|
||||
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
|
||||
const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
|
||||
|
||||
if (use_raytrace_kernel) {
|
||||
INTEGRATOR_PATH_NEXT_SORTED(
|
||||
@@ -167,141 +123,26 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
|
||||
else {
|
||||
INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
|
||||
}
|
||||
}
|
||||
|
||||
/* Schedule next kernel to be executed after executing background shader for shadow catcher. */
|
||||
template<uint32_t current_kernel>
|
||||
ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
|
||||
KernelGlobals kg, IntegratorState state)
|
||||
{
|
||||
/* Same logic as integrator_split_shadow_catcher, but using NEXT instead of INIT. */
|
||||
if (!integrator_state_volume_stack_is_empty(kg, state)) {
|
||||
/* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
|
||||
* objects from it, and then continue shading volume and shadow catcher surface after. */
|
||||
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Continue with shading shadow catcher surface. */
|
||||
integrator_intersect_next_kernel_after_shadow_catcher_volume<current_kernel>(kg, state);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Schedule next kernel to be executed after intersect closest.
|
||||
*
|
||||
* Note that current_kernel is a template value since making this a variable
|
||||
* leads to poor performance with CUDA atomics. */
|
||||
template<uint32_t current_kernel>
|
||||
ccl_device_forceinline void integrator_intersect_next_kernel(
|
||||
KernelGlobals kg,
|
||||
IntegratorState state,
|
||||
ccl_private const Intersection *ccl_restrict isect,
|
||||
ccl_global float *ccl_restrict render_buffer,
|
||||
const bool hit)
|
||||
{
|
||||
/* Continue with volume kernel if we are inside a volume, regardless if we hit anything. */
|
||||
#ifdef __VOLUME__
|
||||
if (!integrator_state_volume_stack_is_empty(kg, state)) {
|
||||
const bool hit_surface = hit && !(isect->type & PRIMITIVE_LAMP);
|
||||
const int shader = (hit_surface) ? intersection_get_shader(kg, isect) : SHADER_NONE;
|
||||
const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
|
||||
|
||||
if (!integrator_intersect_terminate(kg, state, flags)) {
|
||||
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
|
||||
}
|
||||
else {
|
||||
INTEGRATOR_PATH_TERMINATE(current_kernel);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (hit) {
|
||||
/* Hit a surface, continue with light or surface kernel. */
|
||||
if (isect->type & PRIMITIVE_LAMP) {
|
||||
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
|
||||
}
|
||||
else {
|
||||
/* Hit a surface, continue with surface kernel unless terminated. */
|
||||
const int shader = intersection_get_shader(kg, isect);
|
||||
const int flags = kernel_tex_fetch(__shaders, shader).flags;
|
||||
|
||||
if (!integrator_intersect_terminate(kg, state, flags)) {
|
||||
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
|
||||
if (use_raytrace_kernel) {
|
||||
INTEGRATOR_PATH_NEXT_SORTED(
|
||||
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
|
||||
}
|
||||
else {
|
||||
INTEGRATOR_PATH_NEXT_SORTED(
|
||||
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
|
||||
}
|
||||
|
||||
#ifdef __SHADOW_CATCHER__
|
||||
/* Handle shadow catcher. */
|
||||
integrator_split_shadow_catcher(kg, state, isect, render_buffer);
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
INTEGRATOR_PATH_TERMINATE(current_kernel);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Nothing hit, continue with background kernel. */
|
||||
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
|
||||
}
|
||||
}
|
||||
const int object_flags = intersection_get_object_flags(kg, isect);
|
||||
if (kernel_shadow_catcher_split(kg, state, object_flags)) {
|
||||
if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
|
||||
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
|
||||
|
||||
/* Schedule next kernel to be executed after shade volume.
|
||||
*
|
||||
* The logic here matches integrator_intersect_next_kernel, except that
|
||||
* volume shading and termination testing have already been done. */
|
||||
template<uint32_t current_kernel>
|
||||
ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
|
||||
KernelGlobals kg,
|
||||
IntegratorState state,
|
||||
ccl_private const Intersection *ccl_restrict isect,
|
||||
ccl_global float *ccl_restrict render_buffer)
|
||||
{
|
||||
if (isect->prim != PRIM_NONE) {
|
||||
/* Hit a surface, continue with light or surface kernel. */
|
||||
if (isect->type & PRIMITIVE_LAMP) {
|
||||
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
|
||||
return;
|
||||
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
|
||||
}
|
||||
else if (use_raytrace_kernel) {
|
||||
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
|
||||
}
|
||||
else {
|
||||
/* Hit a surface, continue with surface kernel unless terminated. */
|
||||
const int shader = intersection_get_shader(kg, isect);
|
||||
const int flags = kernel_tex_fetch(__shaders, shader).flags;
|
||||
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
|
||||
|
||||
if (use_raytrace_kernel) {
|
||||
INTEGRATOR_PATH_NEXT_SORTED(
|
||||
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
|
||||
}
|
||||
else {
|
||||
INTEGRATOR_PATH_NEXT_SORTED(
|
||||
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
|
||||
}
|
||||
|
||||
#ifdef __SHADOW_CATCHER__
|
||||
/* Handle shadow catcher. */
|
||||
integrator_split_shadow_catcher(kg, state, isect, render_buffer);
|
||||
#endif
|
||||
return;
|
||||
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Nothing hit, continue with background kernel. */
|
||||
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device void integrator_intersect_closest(KernelGlobals kg,
|
||||
IntegratorState state,
|
||||
ccl_global float *ccl_restrict render_buffer)
|
||||
ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state)
|
||||
{
|
||||
PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);
|
||||
|
||||
@@ -350,9 +191,56 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
  /* Write intersection result into global integrator state memory. */
  integrator_state_write_isect(kg, state, &isect);

  /* Set up the next kernel to be executed. */
  integrator_intersect_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
      kg, state, &isect, render_buffer, hit);
#ifdef __VOLUME__
  if (!integrator_state_volume_stack_is_empty(kg, state)) {
    const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
    const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
    const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;

    if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
            kg, state, flags)) {
      /* Continue with volume kernel if we are inside a volume, regardless
       * if we hit anything. */
      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
                           DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
    }
    else {
      INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
    }
    return;
  }
#endif

  if (hit) {
    /* Hit a surface, continue with light or surface kernel. */
    if (isect.type & PRIMITIVE_LAMP) {
      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
                           DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
      return;
    }
    else {
      /* Hit a surface, continue with surface kernel unless terminated. */
      const int shader = intersection_get_shader(kg, &isect);
      const int flags = kernel_tex_fetch(__shaders, shader).flags;

      if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
              kg, state, flags)) {
        integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
            kg, state, &isect, shader, flags);
        return;
      }
      else {
        INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
        return;
      }
    }
  }
  else {
    /* Nothing hit, continue with background kernel. */
    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
                         DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    return;
  }
}

CCL_NAMESPACE_END
@@ -42,13 +42,10 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
|
||||
/* Store to avoid global fetches on every intersection step. */
|
||||
const uint volume_stack_size = kernel_data.volume_stack_size;
|
||||
|
||||
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
|
||||
const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_ALL_VISIBILITY);
|
||||
|
||||
#ifdef __VOLUME_RECORD_ALL__
|
||||
Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
|
||||
uint num_hits = scene_intersect_volume_all(
|
||||
kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
|
||||
kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
|
||||
if (num_hits > 0) {
|
||||
Intersection *isect = hits;
|
||||
|
||||
@@ -63,7 +60,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
|
||||
Intersection isect;
|
||||
int step = 0;
|
||||
while (step < 2 * volume_stack_size &&
|
||||
scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
|
||||
scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
|
||||
shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
|
||||
volume_stack_enter_exit(kg, state, stack_sd);
|
||||
|
||||
@@ -77,7 +74,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
|
||||
#endif
|
||||
}
|
||||
|
||||
ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState state)
|
||||
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
|
||||
{
|
||||
PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
|
||||
|
||||
@@ -86,26 +83,16 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
|
||||
|
||||
Ray volume_ray ccl_optional_struct_init;
|
||||
integrator_state_read_ray(kg, state, &volume_ray);
|
||||
|
||||
/* Trace ray in random direction. Any direction works, Z up is a guess to get the
|
||||
* fewest hits. */
|
||||
volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
|
||||
volume_ray.t = FLT_MAX;
|
||||
|
||||
const uint visibility = (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_ALL_VISIBILITY);
|
||||
int stack_index = 0, enclosed_index = 0;
|
||||
|
||||
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
|
||||
const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_CAMERA);
|
||||
|
||||
/* Initialize volume stack with background volume For shadow catcher the
|
||||
* background volume is always assumed to be CG. */
|
||||
/* Write background shader. */
|
||||
if (kernel_data.background.volume_shader != SHADER_NONE) {
|
||||
if (!(path_flag & PATH_RAY_SHADOW_CATCHER_PASS)) {
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, stack_index, object) = OBJECT_NONE;
|
||||
INTEGRATOR_STATE_ARRAY_WRITE(
|
||||
state, volume_stack, stack_index, shader) = kernel_data.background.volume_shader;
|
||||
stack_index++;
|
||||
}
|
||||
const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
|
||||
integrator_state_write_volume_stack(state, stack_index, new_entry);
|
||||
stack_index++;
|
||||
}
|
||||
|
||||
/* Store to avoid global fetches on every intersection step. */
|
||||
@@ -160,7 +147,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
|
||||
int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
|
||||
int step = 0;
|
||||
|
||||
while (stack_index < volume_stack_size - 1 && enclosed_index < MAX_VOLUME_STACK_SIZE - 1 &&
|
||||
while (stack_index < volume_stack_size - 1 && enclosed_index < volume_stack_size - 1 &&
|
||||
step < 2 * volume_stack_size) {
|
||||
Intersection isect;
|
||||
if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
|
||||
@@ -211,22 +198,9 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
|
||||
/* Write terminator. */
|
||||
const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
|
||||
integrator_state_write_volume_stack(state, stack_index, new_entry);
|
||||
}
|
||||
|
||||
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
|
||||
{
|
||||
integrator_volume_stack_init(kg, state);
|
||||
|
||||
if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_PASS) {
|
||||
/* Volume stack re-init for shadow catcher, continue with shading of hit. */
|
||||
integrator_intersect_next_kernel_after_shadow_catcher_volume<
|
||||
DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK>(kg, state);
|
||||
}
|
||||
else {
|
||||
/* Volume stack init for camera rays, continue with intersection of camera ray. */
|
||||
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
|
||||
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
|
||||
}
|
||||
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
|
||||
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@@ -76,7 +76,7 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
  if (queued_kernel) {
    switch (queued_kernel) {
      case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
        integrator_intersect_closest(kg, state, render_buffer);
        integrator_intersect_closest(kg, state);
        break;
      case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
        integrator_shade_background(kg, state, render_buffer);
@@ -67,7 +67,6 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
  INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = 0.0f;
  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
  INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX;
  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
  INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);

  if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
@@ -185,7 +184,7 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,

  /* Render pass categories. */
  if (bounce == 1) {
    flag |= PATH_RAY_SURFACE_PASS;
    flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
  }
}
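/* Sketch of the pass-category logic above, illustration only: only the first
 * bounce decides which light-pass family an indirect contribution belongs to;
 * transmissive samples go to the transmission passes, everything else to
 * reflection. Flag values here are hypothetical. */
#include <cstdint>

enum : uint32_t {
  EXAMPLE_TRANSMISSION_PASS = 1u << 0,
  EXAMPLE_REFLECT_PASS = 1u << 1,
};

static inline uint32_t example_pass_category(int bounce, bool is_transmit)
{
  if (bounce != 1) {
    return 0u; /* Category was already chosen on the first bounce. */
  }
  return is_transmit ? EXAMPLE_TRANSMISSION_PASS : EXAMPLE_REFLECT_PASS;
}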
@@ -175,7 +175,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,

      /* Write to render buffer. */
      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
      kernel_accum_emission(kg, state, throughput * light_eval, render_buffer);
      kernel_accum_emission(kg, state, throughput, light_eval, render_buffer);
    }
  }
}
@@ -192,11 +192,23 @@ ccl_device void integrator_shade_background(KernelGlobals kg,

#ifdef __SHADOW_CATCHER__
  if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
    /* Special case for shadow catcher where we want to fill the background pass
     * behind the shadow catcher but also continue tracing the path. */
    INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
    integrator_intersect_next_kernel_after_shadow_catcher_background<
        DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND>(kg, state);

    const int isect_prim = INTEGRATOR_STATE(state, isect, prim);
    const int isect_type = INTEGRATOR_STATE(state, isect, type);
    const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
    const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;

    if (shader_flags & SD_HAS_RAYTRACE) {
      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                  shader);
    }
    else {
      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
                                  shader);
    }
    return;
  }
#endif
@@ -90,7 +90,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,

  /* Write to render buffer. */
  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
  kernel_accum_emission(kg, state, throughput * light_eval, render_buffer);
  kernel_accum_emission(kg, state, throughput, light_eval, render_buffer);
}

ccl_device void integrator_shade_light(KernelGlobals kg,
@@ -101,7 +101,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
|
||||
}
|
||||
|
||||
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
|
||||
kernel_accum_emission(kg, state, throughput * L, render_buffer);
|
||||
kernel_accum_emission(kg, state, throughput, L, render_buffer);
|
||||
}
|
||||
#endif /* __EMISSION__ */
|
||||
|
||||
@@ -191,18 +191,14 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
|
||||
const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
|
||||
uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
|
||||
shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
|
||||
shadow_flag |= PATH_RAY_SURFACE_PASS;
|
||||
shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
|
||||
const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
|
||||
|
||||
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
|
||||
const float3 pass_diffuse_weight = (bounce == 0) ?
|
||||
bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
|
||||
INTEGRATOR_STATE(state, path, pass_diffuse_weight);
|
||||
const float3 pass_glossy_weight = (bounce == 0) ?
|
||||
bsdf_eval_pass_glossy_weight(&bsdf_eval) :
|
||||
INTEGRATOR_STATE(state, path, pass_glossy_weight);
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
|
||||
const float3 diffuse_glossy_ratio = (bounce == 0) ?
|
||||
bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
|
||||
INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
|
||||
}
|
||||
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
|
||||
@@ -287,9 +283,7 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
|
||||
|
||||
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
|
||||
if (INTEGRATOR_STATE(state, path, bounce) == 0) {
|
||||
INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = bsdf_eval_pass_diffuse_weight(
|
||||
&bsdf_eval);
|
||||
INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = bsdf_eval_pass_glossy_weight(
|
||||
INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(
|
||||
&bsdf_eval);
|
||||
}
|
||||
}
|
||||
@@ -451,7 +445,7 @@ ccl_device bool integrate_surface(KernelGlobals kg,
|
||||
}
|
||||
#endif
|
||||
|
||||
shader_prepare_surface_closures(kg, state, &sd, path_flag);
|
||||
shader_prepare_surface_closures(kg, state, &sd);
|
||||
|
||||
#ifdef __HOLDOUT__
|
||||
/* Evaluate holdout. */
|
||||
@@ -485,7 +479,7 @@ ccl_device bool integrate_surface(KernelGlobals kg,
|
||||
if (!(path_flag & PATH_RAY_SUBSURFACE)) {
|
||||
const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
|
||||
0.0f :
|
||||
INTEGRATOR_STATE(state, path, continuation_probability);
|
||||
path_state_continuation_probability(kg, state, path_flag);
|
||||
if (probability == 0.0f) {
|
||||
return false;
|
||||
}
|
||||
@@ -498,6 +492,10 @@ ccl_device bool integrate_surface(KernelGlobals kg,
|
||||
kernel_write_denoising_features_surface(kg, state, &sd, render_buffer);
|
||||
#endif
|
||||
|
||||
#ifdef __SHADOW_CATCHER__
|
||||
kernel_write_shadow_catcher_bounce_data(kg, state, &sd, render_buffer);
|
||||
#endif
|
||||
|
||||
/* Direct light. */
|
||||
PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
|
||||
integrate_surface_direct_light(kg, state, &sd, &rng_state);
|
||||
|
@@ -608,7 +608,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
|
||||
if (!result.indirect_scatter) {
|
||||
const float3 emission = volume_emission_integrate(
|
||||
&coeff, closure_flag, transmittance, dt);
|
||||
accum_emission += result.indirect_throughput * emission;
|
||||
accum_emission += emission;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -661,7 +661,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
|
||||
|
||||
/* Write accumulated emission. */
|
||||
if (!is_zero(accum_emission)) {
|
||||
kernel_accum_emission(kg, state, accum_emission, render_buffer);
|
||||
kernel_accum_emission(kg, state, result.indirect_throughput, accum_emission, render_buffer);
|
||||
}
|
||||
|
||||
# ifdef __DENOISING_FEATURES__
|
||||
@@ -794,11 +794,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
|
||||
const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
|
||||
|
||||
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
|
||||
const float3 pass_diffuse_weight = (bounce == 0) ?
|
||||
one_float3() :
|
||||
INTEGRATOR_STATE(state, path, pass_diffuse_weight);
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
|
||||
const float3 diffuse_glossy_ratio = (bounce == 0) ?
|
||||
one_float3() :
|
||||
INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
|
||||
}
|
||||
|
||||
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
|
||||
@@ -877,8 +876,7 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
|
||||
INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput_phase;
|
||||
|
||||
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
|
||||
INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
|
||||
INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
|
||||
INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = one_float3();
|
||||
}
|
||||
|
||||
/* Update path state */
|
||||
@@ -943,7 +941,7 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
|
||||
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
|
||||
const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
|
||||
0.0f :
|
||||
INTEGRATOR_STATE(state, path, continuation_probability);
|
||||
path_state_continuation_probability(kg, state, path_flag);
|
||||
if (probability == 0.0f) {
|
||||
return VOLUME_PATH_MISSED;
|
||||
}
|
||||
@@ -1025,9 +1023,25 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
|
||||
}
|
||||
else {
|
||||
/* Continue to background, light or surface. */
|
||||
integrator_intersect_next_kernel_after_volume<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
|
||||
kg, state, &isect, render_buffer);
|
||||
return;
|
||||
if (isect.prim == PRIM_NONE) {
|
||||
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
|
||||
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
|
||||
return;
|
||||
}
|
||||
else if (isect.type & PRIMITIVE_LAMP) {
|
||||
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
|
||||
DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
|
||||
return;
|
||||
}
|
||||
else {
|
||||
/* Hit a surface, continue with surface kernel unless terminated. */
|
||||
const int shader = intersection_get_shader(kg, &isect);
|
||||
const int flags = kernel_tex_fetch(__shaders, shader).flags;
|
||||
|
||||
integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
|
||||
kg, state, &isect, shader, flags);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif /* __VOLUME__ */
|
||||
}
|
||||
|
@@ -105,45 +105,8 @@ ccl_device_inline void shader_copy_volume_phases(ccl_private ShaderVolumePhases
|
||||
|
||||
ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
|
||||
ConstIntegratorState state,
|
||||
ccl_private ShaderData *sd,
|
||||
const uint32_t path_flag)
|
||||
ccl_private ShaderData *sd)
|
||||
{
|
||||
/* Filter out closures. */
|
||||
if (kernel_data.integrator.filter_closures) {
|
||||
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
|
||||
sd->closure_emission_background = zero_float3();
|
||||
}
|
||||
|
||||
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
|
||||
sd->flag &= ~SD_BSDF_HAS_EVAL;
|
||||
}
|
||||
|
||||
if (path_flag & PATH_RAY_CAMERA) {
|
||||
for (int i = 0; i < sd->num_closure; i++) {
|
||||
ccl_private ShaderClosure *sc = &sd->closure[i];
|
||||
|
||||
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
|
||||
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE) {
|
||||
sc->type = CLOSURE_NONE_ID;
|
||||
sc->sample_weight = 0.0f;
|
||||
}
|
||||
}
|
||||
else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) {
|
||||
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY) {
|
||||
sc->type = CLOSURE_NONE_ID;
|
||||
sc->sample_weight = 0.0f;
|
||||
}
|
||||
}
|
||||
else if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) {
|
||||
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION) {
|
||||
sc->type = CLOSURE_NONE_ID;
|
||||
sc->sample_weight = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Defensive sampling.
|
||||
*
|
||||
* We can likely also do defensive sampling at deeper bounces, particularly
|
||||
@@ -246,7 +209,8 @@ ccl_device_inline float _shader_bsdf_multi_eval(KernelGlobals kg,
|
||||
float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
|
||||
|
||||
if (bsdf_pdf != 0.0f) {
|
||||
bsdf_eval_accum(result_eval, sc->type, eval * sc->weight);
|
||||
const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
|
||||
bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
|
||||
sum_pdf += bsdf_pdf * sc->sample_weight;
|
||||
}
|
||||
}
|
||||
@@ -271,7 +235,7 @@ ccl_device_inline
|
||||
ccl_private BsdfEval *bsdf_eval,
|
||||
const uint light_shader_flags)
|
||||
{
|
||||
bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_float3());
|
||||
bsdf_eval_init(bsdf_eval, false, zero_float3());
|
||||
|
||||
return _shader_bsdf_multi_eval(
|
||||
kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
|
||||
@@ -364,7 +328,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals kg,
|
||||
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
|
||||
|
||||
if (*pdf != 0.0f) {
|
||||
bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
|
||||
const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
|
||||
bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
|
||||
|
||||
if (sd->num_closure > 1) {
|
||||
const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
|
||||
@@ -690,7 +655,7 @@ ccl_device_inline float _shader_volume_phase_multi_eval(
|
||||
float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
|
||||
|
||||
if (phase_pdf != 0.0f) {
|
||||
bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
|
||||
bsdf_eval_accum(result_eval, false, eval, 1.0f);
|
||||
sum_pdf += phase_pdf * svc->sample_weight;
|
||||
}
|
||||
|
||||
@@ -706,7 +671,7 @@ ccl_device float shader_volume_phase_eval(KernelGlobals kg,
|
||||
const float3 omega_in,
|
||||
ccl_private BsdfEval *phase_eval)
|
||||
{
|
||||
bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_float3());
|
||||
bsdf_eval_init(phase_eval, false, zero_float3());
|
||||
|
||||
return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
|
||||
}
|
||||
@@ -764,7 +729,7 @@ ccl_device int shader_volume_phase_sample(KernelGlobals kg,
|
||||
label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
|
||||
|
||||
if (*pdf != 0.0f) {
|
||||
bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
|
||||
bsdf_eval_init(phase_eval, false, eval);
|
||||
}
|
||||
|
||||
return label;
|
||||
@@ -787,7 +752,7 @@ ccl_device int shader_phase_sample_closure(KernelGlobals kg,
|
||||
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
|
||||
|
||||
if (*pdf != 0.0f)
|
||||
bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
|
||||
bsdf_eval_init(phase_eval, false, eval);
|
||||
|
||||
return label;
|
||||
}
|
||||
|
@@ -16,7 +16,6 @@

#pragma once

-#include "kernel/film/write_passes.h"
#include "kernel/integrator/path_state.h"
#include "kernel/integrator/state_util.h"

@@ -48,7 +47,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
return false;
}

-if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
+if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
return false;
}

@@ -77,6 +76,33 @@ ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
}

+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+* after this function. */
+ccl_device_inline bool kernel_shadow_catcher_split(KernelGlobals kg,
+IntegratorState state,
+const int object_flags)
+{
+#ifdef __SHADOW_CATCHER__
+
+if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
+return false;
+}
+
+/* The split is to be done. Mark the current state as such, so that it stops contributing to the
+* shadow catcher matte pass, but keeps contributing to the combined pass. */
+INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+/* Split new state from the current one. This new state will only track contribution of shadow
+* catcher objects ignoring non-catcher objects. */
+integrator_state_shadow_catcher_split(kg, state);
+
+return true;
+#else
+(void)object_flags;
+return false;
+#endif
+}
+
#ifdef __SHADOW_CATCHER__

ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(const uint32_t path_flag)
@@ -89,28 +115,6 @@ ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(const uint32_t
return path_flag & PATH_RAY_SHADOW_CATCHER_PASS;
}

-/* Write shadow catcher passes on a bounce from the shadow catcher object. */
-ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
-KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
-{
-kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
-kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-
-const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-kernel_data.film.pass_stride;
-ccl_global float *buffer = render_buffer + render_buffer_offset;
-
-/* Count sample for the shadow catcher object. */
-kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
-
-/* Since the split is done, the sample does not contribute to the matte, so accumulate it as
-* transparency to the matte. */
-const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
-average(throughput));
-}
-
#endif /* __SHADOW_CATCHER__ */

CCL_NAMESPACE_END
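The split itself is mostly flag bookkeeping: the original path is tagged as having hit a catcher, and the copied path is tagged as the catcher-only pass. A tiny standalone sketch of that bookkeeping (hypothetical flag names, not the Cycles state API):

#include <cstdint>
#include <cstdio>

enum : uint32_t {
  RAY_SHADOW_CATCHER_HIT = 1u << 0,  // original path: stop writing the matte pass
  RAY_SHADOW_CATCHER_PASS = 1u << 1, // split path: only accumulate catcher objects
};

struct PathState {
  uint32_t flag;
};

// Mark the current path and return a copy that represents the catcher-only path.
static PathState split_shadow_catcher(PathState &state)
{
  state.flag |= RAY_SHADOW_CATCHER_HIT;
  PathState split = state; // copy of the path state
  split.flag |= RAY_SHADOW_CATCHER_PASS;
  return split;
}

int main()
{
  PathState state = {0};
  PathState split = split_shadow_catcher(state);
  printf("main=%u split=%u\n", state.flag, split.flag); // prints 1 and 3
  return 0;
}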
@@ -20,7 +20,7 @@ KERNEL_STRUCT_BEGIN(shadow_path)
/* Index of a pixel within the device render buffer. */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
/* Current sample number. */
-KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, sample, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
/* Random number generator seed. */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
/* Random number dimension offset. */
@@ -46,9 +46,8 @@ KERNEL_STRUCT_MEMBER(shadow_path,
float3,
unshadowed_throughput,
KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
-/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
/* Number of intersections found by ray-tracing. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_path)
@@ -173,10 +173,10 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;

/* Array access on GPU with Structure-of-Arrays. */

-typedef int IntegratorState;
-typedef int ConstIntegratorState;
-typedef int IntegratorShadowState;
-typedef int ConstIntegratorShadowState;
+typedef const int IntegratorState;
+typedef const int ConstIntegratorState;
+typedef const int IntegratorShadowState;
+typedef const int ConstIntegratorShadowState;

# define INTEGRATOR_STATE_NULL -1

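On the GPU the state handle is a plain integer index into structure-of-arrays buffers, while on the CPU it is a pointer to a per-path struct; the typedefs above only change the GPU handle type. A minimal sketch of the two access patterns (illustrative structs, not the INTEGRATOR_STATE macros):

#include <cstdio>

// CPU-style array-of-structs path state: the handle is a pointer.
struct PathStateAoS {
  float throughput;
  int bounce;
};

// GPU-style structure-of-arrays: one array per member, indexed by an int handle.
struct PathStateSoA {
  float throughput[4];
  int bounce[4];
};

int main()
{
  PathStateAoS cpu_state = {1.0f, 0};
  PathStateAoS *cpu_handle = &cpu_state; // handle is a pointer
  cpu_handle->bounce += 1;

  PathStateSoA gpu_states = {{1.0f, 1.0f, 1.0f, 1.0f}, {0, 0, 0, 0}};
  int gpu_handle = 2; // handle is just an index into the SoA buffers
  gpu_states.bounce[gpu_handle] += 1;

  printf("cpu bounce=%d gpu bounce=%d\n", cpu_state.bounce, gpu_states.bounce[gpu_handle]);
  return 0;
}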
@@ -25,7 +25,7 @@ KERNEL_STRUCT_BEGIN(path)
* The multiplication is delayed for later, so that state can use 32bit integer. */
KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
/* Current sample number. */
-KERNEL_STRUCT_MEMBER(path, uint32_t, sample, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
/* Current ray bounce depth. */
KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
/* Current transparent ray bounce depth. */
@@ -56,13 +56,10 @@ KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
/* Filter glossy. */
KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
-/* Continuation probability for path termination. */
-KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
/* Throughput. */
KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
-/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
/* Denoising. */
KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
/* Shader sorting. */
@@ -326,8 +326,8 @@ ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,

/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
* after this function. */
-ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGlobals kg,
-IntegratorState state)
+ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
+IntegratorState state)
{
#if defined(__KERNEL_GPU__)
ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
@@ -337,14 +337,14 @@ ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGl
#else
IntegratorStateCPU *ccl_restrict to_state = state + 1;

-/* Only copy the required subset for performance. */
+/* Only copy the required subset, since shadow intersections are big and irrelevant here. */
to_state->path = state->path;
to_state->ray = state->ray;
to_state->isect = state->isect;
integrator_state_copy_volume_stack(kg, to_state, state);
#endif

-return to_state;
+INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
}

#ifdef __KERNEL_CPU__
@@ -79,8 +79,7 @@ ccl_device int subsurface_bounce(KernelGlobals kg,

if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
if (INTEGRATOR_STATE(state, path, bounce) == 0) {
-INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
-INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
+INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = one_float3();
}
}

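Both variants store a per-path weight at the first bounce that is later used to split light contributions between the diffuse and glossy passes. How exactly the ratio is applied lives elsewhere in the kernel; the sketch below is only an assumed illustration of using such a ratio as a lerp weight:

#include <cstdio>

// Split one radiance contribution into diffuse and glossy passes using a
// per-path ratio initialized at the first bounce (1 = fully diffuse).
static void write_light_passes(float contrib, float diffuse_glossy_ratio,
                               float *pass_diffuse, float *pass_glossy)
{
  *pass_diffuse += contrib * diffuse_glossy_ratio;
  *pass_glossy += contrib * (1.0f - diffuse_glossy_ratio);
}

int main()
{
  float pass_diffuse = 0.0f, pass_glossy = 0.0f;
  // A subsurface entry counts as fully diffuse, hence a ratio of 1 at bounce 0.
  write_light_passes(0.75f, 1.0f, &pass_diffuse, &pass_glossy);
  printf("diffuse=%f glossy=%f\n", pass_diffuse, pass_glossy);
  return 0;
}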
@@ -353,8 +353,8 @@ ccl_device bool light_sample_from_distant_ray(KernelGlobals kg,
/* compute pdf */
float invarea = klight->distant.invarea;
ls->pdf = invarea / (costheta * costheta * costheta);
-ls->eval_fac = ls->pdf;
ls->pdf *= kernel_data.integrator.pdf_lights;
+ls->eval_fac = ls->pdf;

return true;
}
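The only change in this hunk is whether eval_fac is captured before or after the pdf is multiplied by the light-count pdf, which decides whether the light-selection probability ends up in the evaluation factor. A short numeric check (plain arithmetic, not the Cycles light code):

#include <cstdio>

int main()
{
  const float invarea = 2.0f, costheta = 0.9f, pdf_lights = 0.25f;
  float pdf = invarea / (costheta * costheta * costheta);

  const float eval_fac_before = pdf;             // captured before pdf_lights is applied
  const float eval_fac_after = pdf * pdf_lights; // captured after pdf_lights is applied

  pdf *= pdf_lights;
  printf("pdf=%f before=%f after=%f\n", pdf, eval_fac_before, eval_fac_after);
  return 0;
}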
@@ -832,21 +832,16 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
{
if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) {
-const float *data = (const float *)attr.value.data();
-return set_attribute_float3(make_float3(data[0], data[1], data[2]), type, derivatives, val);
+return set_attribute_float3(*(float3 *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == TypeFloat2) {
-const float *data = (const float *)attr.value.data();
-return set_attribute_float2(make_float2(data[0], data[1]), type, derivatives, val);
+return set_attribute_float2(*(float2 *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == TypeDesc::TypeFloat) {
-const float *data = (const float *)attr.value.data();
-return set_attribute_float(data[0], type, derivatives, val);
+return set_attribute_float(*(float *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == TypeRGBA || attr.type == TypeDesc::TypeFloat4) {
-const float *data = (const float *)attr.value.data();
-return set_attribute_float4(
-make_float4(data[0], data[1], data[2], data[3]), type, derivatives, val);
+return set_attribute_float4(*(float4 *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == type) {
size_t datasize = attr.value.datasize();
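Both sides read the attribute's raw bytes: one copies component by component through make_float3 and friends, the other reinterprets the buffer pointer, which assumes the stored layout matches the target type exactly. A generic C++ sketch of the two approaches (not the OSL services code):

#include <cstdio>
#include <cstring>

struct float3 {
  float x, y, z;
};

int main()
{
  // Raw attribute storage, e.g. as handed over by an attribute system.
  const float data[3] = {0.1f, 0.2f, 0.3f};
  const void *bytes = data;

  // Variant A: build the vector from individual floats (layout-agnostic).
  const float *f = static_cast<const float *>(bytes);
  float3 a = {f[0], f[1], f[2]};

  // Variant B: treat the bytes as one float3; valid only if the source really
  // stores three tightly packed floats (memcpy avoids strict-aliasing issues).
  float3 b;
  std::memcpy(&b, bytes, sizeof(float3));

  printf("a=(%g %g %g) b=(%g %g %g)\n", a.x, a.y, a.z, b.x, b.y, b.z);
  return 0;
}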
@@ -132,12 +132,10 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
/* Used by render-services. */
sd->osl_globals = kg;
if (path_flag & PATH_RAY_SHADOW) {
sd->osl_path_state = nullptr;
sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
}
else {
sd->osl_path_state = (const IntegratorStateCPU *)state;
sd->osl_shadow_path_state = nullptr;
}
}

@@ -697,7 +697,7 @@ shader node_musgrave_texture(
output float Fac = 0.0)
{
float dimension = max(Dimension, 1e-5);
-float octaves = clamp(Detail, 0.0, 15.0);
+float octaves = clamp(Detail, 0.0, 16.0);
float lacunarity = max(Lacunarity, 1e-5);

vector3 s = Vector;
@@ -90,7 +90,7 @@ float fractal_noise(float p, float details, float roughness)
float amp = 1.0;
float maxamp = 0.0;
float sum = 0.0;
-float octaves = clamp(details, 0.0, 15.0);
+float octaves = clamp(details, 0.0, 16.0);
int n = (int)octaves;
for (int i = 0; i <= n; i++) {
float t = safe_noise(fscale * p);
@@ -119,7 +119,7 @@ float fractal_noise(vector2 p, float details, float roughness)
float amp = 1.0;
float maxamp = 0.0;
float sum = 0.0;
-float octaves = clamp(details, 0.0, 15.0);
+float octaves = clamp(details, 0.0, 16.0);
int n = (int)octaves;
for (int i = 0; i <= n; i++) {
float t = safe_noise(fscale * p);
@@ -148,7 +148,7 @@ float fractal_noise(vector3 p, float details, float roughness)
float amp = 1.0;
float maxamp = 0.0;
float sum = 0.0;
-float octaves = clamp(details, 0.0, 15.0);
+float octaves = clamp(details, 0.0, 16.0);
int n = (int)octaves;
for (int i = 0; i <= n; i++) {
float t = safe_noise(fscale * p);
@@ -177,7 +177,7 @@ float fractal_noise(vector4 p, float details, float roughness)
float amp = 1.0;
float maxamp = 0.0;
float sum = 0.0;
-float octaves = clamp(details, 0.0, 15.0);
+float octaves = clamp(details, 0.0, 16.0);
int n = (int)octaves;
for (int i = 0; i <= n; i++) {
float t = safe_noise(fscale * p);
@@ -27,7 +27,7 @@ ccl_device_noinline float fractal_noise_1d(float p, float octaves, float roughne
float amp = 1.0f;
float maxamp = 0.0f;
float sum = 0.0f;
-octaves = clamp(octaves, 0.0f, 15.0f);
+octaves = clamp(octaves, 0.0f, 16.0f);
int n = float_to_int(octaves);
for (int i = 0; i <= n; i++) {
float t = noise_1d(fscale * p);
@@ -56,7 +56,7 @@ ccl_device_noinline float fractal_noise_2d(float2 p, float octaves, float roughn
float amp = 1.0f;
float maxamp = 0.0f;
float sum = 0.0f;
-octaves = clamp(octaves, 0.0f, 15.0f);
+octaves = clamp(octaves, 0.0f, 16.0f);
int n = float_to_int(octaves);
for (int i = 0; i <= n; i++) {
float t = noise_2d(fscale * p);
@@ -85,7 +85,7 @@ ccl_device_noinline float fractal_noise_3d(float3 p, float octaves, float roughn
float amp = 1.0f;
float maxamp = 0.0f;
float sum = 0.0f;
-octaves = clamp(octaves, 0.0f, 15.0f);
+octaves = clamp(octaves, 0.0f, 16.0f);
int n = float_to_int(octaves);
for (int i = 0; i <= n; i++) {
float t = noise_3d(fscale * p);
@@ -114,7 +114,7 @@ ccl_device_noinline float fractal_noise_4d(float4 p, float octaves, float roughn
float amp = 1.0f;
float maxamp = 0.0f;
float sum = 0.0f;
-octaves = clamp(octaves, 0.0f, 15.0f);
+octaves = clamp(octaves, 0.0f, 16.0f);
int n = float_to_int(octaves);
for (int i = 0; i <= n; i++) {
float t = noise_4d(fscale * p);
@@ -737,7 +737,7 @@ ccl_device_noinline int svm_node_tex_musgrave(KernelGlobals kg,
float gain = stack_load_float_default(stack, gain_stack_offset, defaults2.z);

dimension = fmaxf(dimension, 1e-5f);
-detail = clamp(detail, 0.0f, 15.0f);
+detail = clamp(detail, 0.0f, 16.0f);
lacunarity = fmaxf(lacunarity, 1e-5f);

float fac;
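Every hunk in this group only changes the upper clamp of the octave count (15 vs 16), which bounds the fBm loop that follows. A self-contained sketch of such a loop, with a stand-in noise function rather than the kernel's Perlin noise:

#include <cmath>
#include <cstdio>

// Stand-in 1D noise; the real kernels use Perlin-style noise.
static float fake_noise(float p)
{
  return 0.5f * std::sin(p * 12.9898f) + 0.5f;
}

// fBm: sum octaves of noise with decreasing amplitude and increasing frequency.
// The clamp on `octaves` (15 vs 16 in the diff) bounds the loop count.
static float fractal_noise(float p, float octaves, float roughness)
{
  float fscale = 1.0f, amp = 1.0f, maxamp = 0.0f, sum = 0.0f;
  octaves = std::fmin(std::fmax(octaves, 0.0f), 16.0f);
  const int n = static_cast<int>(octaves);
  for (int i = 0; i <= n; i++) {
    sum += fake_noise(fscale * p) * amp;
    maxamp += amp;
    amp *= roughness;
    fscale *= 2.0f;
  }
  return sum / maxamp; // normalize by the accumulated amplitude
}

int main()
{
  printf("fbm = %f\n", fractal_noise(0.37f, 16.0f, 0.5f));
  return 0;
}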
@@ -286,26 +286,27 @@ enum PathRayFlag {
PATH_RAY_DENOISING_FEATURES = (1U << 23U),

/* Render pass categories. */
-PATH_RAY_SURFACE_PASS = (1U << 24U),
-PATH_RAY_VOLUME_PASS = (1U << 25U),
-PATH_RAY_ANY_PASS = (PATH_RAY_SURFACE_PASS | PATH_RAY_VOLUME_PASS),
+PATH_RAY_REFLECT_PASS = (1U << 24U),
+PATH_RAY_TRANSMISSION_PASS = (1U << 25U),
+PATH_RAY_VOLUME_PASS = (1U << 26U),
+PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),

/* Shadow ray is for a light or surface, or AO. */
-PATH_RAY_SHADOW_FOR_LIGHT = (1U << 26U),
-PATH_RAY_SHADOW_FOR_AO = (1U << 27U),
+PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
+PATH_RAY_SHADOW_FOR_AO = (1U << 28U),

/* A shadow catcher object was hit and the path was split into two. */
-PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),
+PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),

/* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
* their dedicated pass for later division.
*
* NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
* which is separate from the light passes. */
-PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),
+PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),

/* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
-PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
+PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
};

/* Configure ray visibility bits for rays and objects respectively,
@@ -427,19 +428,8 @@ typedef enum CryptomatteType {
typedef struct BsdfEval {
float3 diffuse;
float3 glossy;
float3 sum;
} BsdfEval;
-
-/* Closure Filter */
-
-typedef enum FilterClosures {
-FILTER_CLOSURE_EMISSION = (1 << 0),
-FILTER_CLOSURE_DIFFUSE = (1 << 1),
-FILTER_CLOSURE_GLOSSY = (1 << 2),
-FILTER_CLOSURE_TRANSMISSION = (1 << 3),
-FILTER_CLOSURE_DIRECT_LIGHT = (1 << 4),
-} FilterClosures;
-
/* Shader Flag */

typedef enum ShaderFlag {
@@ -1196,11 +1186,7 @@ typedef struct KernelIntegrator {
int has_shadow_catcher;
float scrambling_distance;
-
-/* Closure filter. */
-int filter_closures;
-
/* padding */
int pad1, pad2, pad3;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);

@@ -1424,7 +1410,6 @@ typedef struct KernelWorkTile {

uint start_sample;
uint num_samples;
-uint sample_offset;

int offset;
uint stride;
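The renumbering matters because all of these flags share one 32-bit field: splitting the surface pass into reflect and transmission passes shifts every later flag up by one bit and occupies the last free bit. A tiny sketch of packing and testing such flags (generic enum, not the kernel's):

#include <cstdint>
#include <cstdio>

enum PathFlag : uint32_t {
  FLAG_REFLECT_PASS = 1u << 24,
  FLAG_TRANSMISSION_PASS = 1u << 25,
  FLAG_VOLUME_PASS = 1u << 26,
  FLAG_ANY_PASS = FLAG_REFLECT_PASS | FLAG_TRANSMISSION_PASS | FLAG_VOLUME_PASS,
  FLAG_CATCHER_BACKGROUND = 1u << 31, // the highest bit a 32-bit flag field can hold
};

int main()
{
  uint32_t path_flag = FLAG_REFLECT_PASS | FLAG_CATCHER_BACKGROUND;
  printf("any pass:   %d\n", (path_flag & FLAG_ANY_PASS) != 0);
  printf("background: %d\n", (path_flag & FLAG_CATCHER_BACKGROUND) != 0);
  return 0;
}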
@@ -43,7 +43,7 @@ bool ConstantFolder::all_inputs_constant() const

void ConstantFolder::make_constant(float value) const
{
-VLOG(3) << "Folding " << node->name << "::" << output->name() << " to constant (" << value
+VLOG(1) << "Folding " << node->name << "::" << output->name() << " to constant (" << value
<< ").";

foreach (ShaderInput *sock, output->links) {
@@ -56,7 +56,7 @@ void ConstantFolder::make_constant(float value) const

void ConstantFolder::make_constant(float3 value) const
{
-VLOG(3) << "Folding " << node->name << "::" << output->name() << " to constant " << value << ".";
+VLOG(1) << "Folding " << node->name << "::" << output->name() << " to constant " << value << ".";

foreach (ShaderInput *sock, output->links) {
sock->set(value);
@@ -112,7 +112,7 @@ void ConstantFolder::bypass(ShaderOutput *new_output) const
{
assert(new_output);

-VLOG(3) << "Folding " << node->name << "::" << output->name() << " to socket "
+VLOG(1) << "Folding " << node->name << "::" << output->name() << " to socket "
<< new_output->parent->name << "::" << new_output->name() << ".";

/* Remove all outgoing links from socket and connect them to new_output instead.
@@ -131,7 +131,7 @@ void ConstantFolder::discard() const
{
assert(output->type() == SocketType::CLOSURE);

-VLOG(3) << "Discarding closure " << node->name << ".";
+VLOG(1) << "Discarding closure " << node->name << ".";

graph->disconnect(output);
}
@@ -187,6 +187,8 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_transmission_indirect = PASS_UNUSED;
kfilm->pass_volume_direct = PASS_UNUSED;
kfilm->pass_volume_indirect = PASS_UNUSED;
+kfilm->pass_volume_direct = PASS_UNUSED;
+kfilm->pass_volume_indirect = PASS_UNUSED;
kfilm->pass_shadow = PASS_UNUSED;

/* Mark passes as unused so that the kernel knows the pass is inaccessible. */
@@ -671,12 +673,13 @@ uint Film::get_kernel_features(const Scene *scene) const
kernel_features |= KERNEL_FEATURE_DENOISING;
}

-if (pass_type >= PASS_DIFFUSE && pass_type <= PASS_VOLUME_INDIRECT) {
+if (pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
+pass_type <= PASS_CATEGORY_LIGHT_END) {
kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
-}

-if (pass_type == PASS_SHADOW) {
-kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
+if (pass_type == PASS_SHADOW) {
+kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
+}
}

if (pass_type == PASS_AO) {
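The feature check moves from an explicit pass range to a category boundary, so any light-related pass enables KERNEL_FEATURE_LIGHT_PASSES. A compact sketch of a category-boundary test with hypothetical enum values (the real PassType ordering lives in the kernel headers):

#include <cstdio>

// Hypothetical pass enum where all light-related passes sort before the
// category end marker, so one comparison covers the whole group.
enum PassType {
  PASS_NONE = 0,
  PASS_COMBINED,
  PASS_DIFFUSE,
  PASS_GLOSSY,
  PASS_SHADOW,
  PASS_CATEGORY_LIGHT_END,
  PASS_DEPTH, // not a light pass
};

static bool needs_light_passes(PassType pass_type)
{
  return pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
         pass_type <= PASS_CATEGORY_LIGHT_END;
}

int main()
{
  printf("diffuse: %d\n", needs_light_passes(PASS_DIFFUSE)); // 1
  printf("depth:   %d\n", needs_light_passes(PASS_DEPTH));   // 0
  return 0;
}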
@@ -1588,20 +1588,9 @@ void GeometryManager::device_update_displacement_images(Device *device,
set<int> bump_images;
foreach (Geometry *geom, scene->geometry) {
if (geom->is_modified()) {
-/* Geometry-level check for hair shadow transparency.
-* This matches the logic in the `Hair::update_shadow_transparency()`, avoiding access to
-* possible non-loaded images. */
-bool need_shadow_transparency = false;
-if (geom->geometry_type == Geometry::HAIR) {
-Hair *hair = static_cast<Hair *>(geom);
-need_shadow_transparency = hair->need_shadow_transparency();
-}
-
foreach (Node *node, geom->get_used_shaders()) {
Shader *shader = static_cast<Shader *>(node);
-const bool is_true_displacement = (shader->has_displacement &&
-shader->get_displacement_method() != DISPLACE_BUMP);
-if (!is_true_displacement && !need_shadow_transparency) {
+if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
continue;
}
foreach (ShaderNode *node, shader->graph->nodes) {
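The removed side additionally preloads images when a hair geometry needs shadow transparency, not only for true displacement. A condensed sketch of that combined condition (hypothetical structs, not the scene classes):

#include <cstdio>

struct ShaderInfo {
  bool has_displacement;
  bool displace_bump_only; // displacement method is bump-only
};

// Decide whether a shader's images must be loaded up front, combining true
// displacement with a geometry-level hair-transparency need.
static bool needs_image_preload(const ShaderInfo &shader, bool need_shadow_transparency)
{
  const bool is_true_displacement = shader.has_displacement && !shader.displace_bump_only;
  return is_true_displacement || need_shadow_transparency;
}

int main()
{
  printf("%d\n", needs_image_preload({true, false}, false)); // 1: true displacement
  printf("%d\n", needs_image_preload({false, false}, true)); // 1: hair transparency
  printf("%d\n", needs_image_preload({true, true}, false));  // 0: bump only
  return 0;
}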
Some files were not shown because too many files have changed in this diff.