use multi-function

initial sound sampling
storage
2021-11-14 14:29:29 +01:00 · 2021-11-14 12:57:00 +01:00 · 2021-11-14 11:34:37 +01:00 · 2021-11-14 11:29:00 +01:00 · 2021-11-14 11:26:06 +11:00 · 2021-11-14 11:11:20 +11:00
591 changed files with 10360 additions and 6046 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -440,7 +440,11 @@ mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
 mark_as_advanced(WITH_CUDA_DYNLOAD)

 # AMD HIP
-option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" OFF)
+if(WIN32)
+  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" ON)
+else()
+  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" OFF)
+endif()
 option(WITH_CYCLES_HIP_BINARIES      "Build Cycles AMD HIP binaries" OFF)
 set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 CACHE STRING "AMD HIP architectures to build binaries for")
 mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
--- a/build_files/cmake/clang_array_check.py
+++ b/build_files/cmake/clang_array_check.py
@@ -168,7 +168,7 @@ def function_parm_wash_tokens(parm):
    # if tokens[-1].kind == To
    # remove trailing char
    if tokens[-1].kind == TokenKind.PUNCTUATION:
-        if tokens[-1].spelling in (",", ")", ";"):
+        if tokens[-1].spelling in {",", ")", ";"}:
            tokens.pop()
        # else:
        #     print(tokens[-1].spelling)
@@ -179,7 +179,7 @@ def function_parm_wash_tokens(parm):
        t_spelling = t.spelling
        ok = True
        if t_kind == TokenKind.KEYWORD:
-            if t_spelling in ("const", "restrict", "volatile"):
+            if t_spelling in {"const", "restrict", "volatile"}:
                ok = False
            elif t_spelling.startswith("__"):
                ok = False  # __restrict
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -81,4 +81,5 @@ if(NOT APPLE)
  set(WITH_CYCLES_DEVICE_OPTIX    ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_CUDA_BINARIES   ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_CUBIN_COMPILER  OFF CACHE BOOL "" FORCE)
+  set(WITH_CYCLES_HIP_BINARIES    ON  CACHE BOOL "" FORCE)
 endif()
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -5,38 +5,38 @@
 update-code:
    git:
        submodules:
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons_contrib
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: release/datafiles/locale
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: source/tools
    svn:
        libraries:
            darwin-arm64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin_arm64
            darwin-x86_64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin
            linux-x86_64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/linux_centos7_x86_64
            windows-amd64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/win64_vc15
        tests:
-            branch: tags/blender-3.0-release
+            branch: trunk
            commit_id: HEAD
            path: lib/tests
        benchmarks:
--- a/doc/doxygen/Doxyfile
+++ b/doc/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = Blender
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = V3.0
+PROJECT_NUMBER         = V3.1

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1224,7 +1224,10 @@ def pycontext2sphinx(basepath):
        while char_array[i] is not None:
            member = ctypes.string_at(char_array[i]).decode(encoding="ascii")
            fw(".. data:: %s\n\n" % member)
-            member_type, is_seq = context_type_map[member]
+            try:
+                member_type, is_seq = context_type_map[member]
+            except KeyError:
+                raise SystemExit("Error: context key %r not found in context_type_map; update %s" % (member, __file__)) from None
            fw("   :type: %s :class:`bpy.types.%s`\n\n" % ("sequence of " if is_seq else "", member_type))
            unique.add(member)
            i += 1
@@ -2251,7 +2254,7 @@ def main():
    # First monkey patch to load in fake members.
    setup_monkey_patch()

-    # Perform changes to Blender it's self.
+    # Perform changes to Blender itself.
    setup_data = setup_blender()

    # eventually, create the dirs
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -138,11 +138,6 @@ endif()

 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

-# avoid link failure with clang 3.4 debug
-if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS '3.4')
-  string(APPEND CMAKE_CXX_FLAGS_DEBUG " -gline-tables-only")
-endif()
-
 add_dependencies(bf_intern_cycles bf_rna)

 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH})
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -325,6 +325,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        default=1024,
    )

+    sample_offset: IntProperty(
+        name="Sample Offset",
+        description="Number of samples to skip when starting render",
+        min=0, max=(1 << 24),
+        default=0,
+    )
+
    time_limit: FloatProperty(
        name="Time Limit",
        description="Limit the render time (excluding synchronization time)."
@@ -1419,10 +1426,9 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
            elif device_type == 'HIP':
                import sys
-                col.label(text="Requires discrete AMD GPU with RDNA2 architecture", icon='BLANK1')
-                # TODO: provide driver version info.
-                #if sys.platform[:3] == "win":
-                #    col.label(text="and AMD driver version ??? or newer", icon='BLANK1')
+                col.label(text="Requires discrete AMD GPU with RDNA architecture", icon='BLANK1')
+                if sys.platform[:3] == "win":
+                    col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
            return

        for device in devices:
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -290,6 +290,9 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
        col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
        col.prop(cscene, "sampling_pattern", text="Pattern")

+        col = layout.column(align=True)
+        col.prop(cscene, "sample_offset")
+
        layout.separator()

        col = layout.column(align=True)
@@ -1051,7 +1054,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):


 def has_geometry_visibility(ob):
-    return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT'}) or
+    return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT', 'VOLUME', 'POINTCLOUD', 'HAIR'}) or
                   (ob.instance_type == 'COLLECTION' and ob.instance_collection))


--- a/intern/cycles/blender/object.cpp
+++ b/intern/cycles/blender/object.cpp
@@ -62,31 +62,46 @@ bool BlenderSync::BKE_object_is_modified(BL::Object &b_ob)
  return false;
 }

-bool BlenderSync::object_is_geometry(BL::Object &b_ob)
+bool BlenderSync::object_is_geometry(BObjectInfo &b_ob_info)
 {
-  BL::ID b_ob_data = b_ob.data();
+  BL::ID b_ob_data = b_ob_info.object_data;

  if (!b_ob_data) {
    return false;
  }

-  BL::Object::type_enum type = b_ob.type();
+  BL::Object::type_enum type = b_ob_info.iter_object.type();

  if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
    /* Will be exported attached to mesh. */
    return true;
  }
-  else if (type == BL::Object::type_CURVE) {
-    /* Skip exporting curves without faces, overhead can be
-     * significant if there are many for path animation. */
-    BL::Curve b_curve(b_ob_data);

-    return (b_curve.bevel_object() || b_curve.extrude() != 0.0f || b_curve.bevel_depth() != 0.0f ||
-            b_curve.dimensions() == BL::Curve::dimensions_2D || b_ob.modifiers.length());
+  /* Other object types that are not meshes but evaluate to meshes are presented to render engines
+   * as separate instance objects. Metaballs and surface objects have not been affected by that
+   * change yet. */
+  if (type == BL::Object::type_SURFACE || type == BL::Object::type_META) {
+    return true;
  }
-  else {
-    return (b_ob_data.is_a(&RNA_Mesh) || b_ob_data.is_a(&RNA_Curve) ||
-            b_ob_data.is_a(&RNA_MetaBall));
+
+  return b_ob_data.is_a(&RNA_Mesh);
+}
+
+bool BlenderSync::object_can_have_geometry(BL::Object &b_ob)
+{
+  BL::Object::type_enum type = b_ob.type();
+  switch (type) {
+    case BL::Object::type_MESH:
+    case BL::Object::type_CURVE:
+    case BL::Object::type_SURFACE:
+    case BL::Object::type_META:
+    case BL::Object::type_FONT:
+    case BL::Object::type_HAIR:
+    case BL::Object::type_POINTCLOUD:
+    case BL::Object::type_VOLUME:
+      return true;
+    default:
+      return false;
  }
 }

@@ -192,7 +207,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
  }

  /* only interested in object that we can create meshes from */
-  if (!object_is_geometry(b_ob)) {
+  if (!object_is_geometry(b_ob_info)) {
    return NULL;
  }

@@ -279,7 +294,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,

  object->set_visibility(visibility);

-  object->set_is_shadow_catcher(b_ob.is_shadow_catcher());
+  object->set_is_shadow_catcher(b_ob.is_shadow_catcher() || b_parent.is_shadow_catcher());

  float shadow_terminator_shading_offset = get_float(cobject, "shadow_terminator_offset");
  object->set_shadow_terminator_shading_offset(shadow_terminator_shading_offset);
--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -606,6 +606,19 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
  pass->set_type(bake_type_to_pass(bake_type, bake_filter));
  pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));

+  if (pass->get_type() == PASS_COMBINED) {
+    /* Filtering settings for combined pass. */
+    Integrator *integrator = scene->integrator;
+    integrator->set_use_direct_light((bake_filter & BL::BakeSettings::pass_filter_DIRECT) != 0);
+    integrator->set_use_indirect_light((bake_filter & BL::BakeSettings::pass_filter_INDIRECT) !=
+                                       0);
+    integrator->set_use_diffuse((bake_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0);
+    integrator->set_use_glossy((bake_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0);
+    integrator->set_use_transmission((bake_filter & BL::BakeSettings::pass_filter_TRANSMISSION) !=
+                                     0);
+    integrator->set_use_emission((bake_filter & BL::BakeSettings::pass_filter_EMIT) != 0);
+  }
+
  session->set_display_driver(nullptr);
  session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine));

--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -162,19 +162,19 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
    /* Object */
    else if (b_id.is_a(&RNA_Object)) {
      BL::Object b_ob(b_id);
-      const bool is_geometry = object_is_geometry(b_ob);
-      const bool is_light = !is_geometry && object_is_light(b_ob);
+      const bool can_have_geometry = object_can_have_geometry(b_ob);
+      const bool is_light = !can_have_geometry && object_is_light(b_ob);

      if (b_ob.is_instancer() && b_update.is_updated_shading()) {
        /* Needed for e.g. object color updates on instancer. */
        object_map.set_recalc(b_ob);
      }

-      if (is_geometry || is_light) {
+      if (can_have_geometry || is_light) {
        const bool updated_geometry = b_update.is_updated_geometry();

        /* Geometry (mesh, hair, volume). */
-        if (is_geometry) {
+        if (can_have_geometry) {
          if (b_update.is_updated_transform() || b_update.is_updated_shading()) {
            object_map.set_recalc(b_ob);
          }
@@ -835,18 +835,25 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
  /* samples */
  int samples = get_int(cscene, "samples");
  int preview_samples = get_int(cscene, "preview_samples");
+  int sample_offset = get_int(cscene, "sample_offset");

  if (background) {
    params.samples = samples;
+    params.sample_offset = sample_offset;
  }
  else {
    params.samples = preview_samples;
-    if (params.samples == 0)
+    if (params.samples == 0) {
      params.samples = INT_MAX;
+    }
+    params.sample_offset = 0;
  }

+  /* Clamp sample offset. */
+  params.sample_offset = clamp(params.sample_offset, 0, Integrator::MAX_SAMPLES);
+
  /* Clamp samples. */
-  params.samples = min(params.samples, Integrator::MAX_SAMPLES);
+  params.samples = clamp(params.samples, 0, Integrator::MAX_SAMPLES - params.sample_offset);

  /* Viewport Performance */
  params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
--- a/intern/cycles/blender/sync.h
+++ b/intern/cycles/blender/sync.h
@@ -208,7 +208,8 @@ class BlenderSync {
  /* util */
  void find_shader(BL::ID &id, array<Node *> &used_shaders, Shader *default_shader);
  bool BKE_object_is_modified(BL::Object &b_ob);
-  bool object_is_geometry(BL::Object &b_ob);
+  bool object_is_geometry(BObjectInfo &b_ob_info);
+  bool object_can_have_geometry(BL::Object &b_ob);
  bool object_is_light(BL::Object &b_ob);

  /* variables */
--- a/intern/cycles/device/cpu/device.cpp
+++ b/intern/cycles/device/cpu/device.cpp
@@ -38,7 +38,6 @@ void device_cpu_info(vector<DeviceInfo> &devices)
  info.id = "CPU";
  info.num = 0;
  info.has_osl = true;
-  info.has_half_images = true;
  info.has_nanovdb = true;
  info.has_profiling = true;
  if (openimagedenoise_supported()) {
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
 {
  /* Pick any kernel, all of them are supposed to have same level of microarchitecture
   * optimization. */
-  VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
+  VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
+          << " CPU kernels.";

  if (info.cpu_threads == 0) {
    info.cpu_threads = TaskScheduler::num_threads();
@@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
    Device::build_bvh(bvh, progress, refit);
 }

-const CPUKernels *CPUDevice::get_cpu_kernels() const
-{
-  return &kernels;
-}
-
 void CPUDevice::get_cpu_kernel_thread_globals(
    vector<CPUKernelThreadGlobals> &kernel_thread_globals)
 {
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -57,8 +57,6 @@ class CPUDevice : public Device {
  RTCDevice embree_device;
 #endif

-  CPUKernels kernels;
-
  CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
  ~CPUDevice();

@@ -90,7 +88,6 @@ class CPUDevice : public Device {

  void build_bvh(BVH *bvh, Progress &progress, bool refit) override;

-  virtual const CPUKernels *get_cpu_kernels() const override;
  virtual void get_cpu_kernel_thread_globals(
      vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
  virtual void *get_cpu_osl_memory() override;
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
      KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)

 #define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+#define REGISTER_KERNEL_FILM_CONVERT(name) \
+  film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
+      film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))

 CPUKernels::CPUKernels()
    : /* Integrator. */
@@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
      REGISTER_KERNEL(adaptive_sampling_filter_x),
      REGISTER_KERNEL(adaptive_sampling_filter_y),
      /* Cryptomatte. */
-      REGISTER_KERNEL(cryptomatte_postprocess)
+      REGISTER_KERNEL(cryptomatte_postprocess),
+      /* Film Convert. */
+      REGISTER_KERNEL_FILM_CONVERT(depth),
+      REGISTER_KERNEL_FILM_CONVERT(mist),
+      REGISTER_KERNEL_FILM_CONVERT(sample_count),
+      REGISTER_KERNEL_FILM_CONVERT(float),
+      REGISTER_KERNEL_FILM_CONVERT(light_path),
+      REGISTER_KERNEL_FILM_CONVERT(float3),
+      REGISTER_KERNEL_FILM_CONVERT(motion),
+      REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
+      REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
+      REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
+      REGISTER_KERNEL_FILM_CONVERT(combined),
+      REGISTER_KERNEL_FILM_CONVERT(float4)
 {
 }

 #undef REGISTER_KERNEL
+#undef REGISTER_KERNEL_FILM_CONVERT
 #undef KERNEL_FUNCTIONS

 CCL_NAMESPACE_END
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -17,11 +17,13 @@
 #pragma once

 #include "device/cpu/kernel_function.h"
+#include "util/half.h"
 #include "util/types.h"

 CCL_NAMESPACE_BEGIN

 struct KernelGlobalsCPU;
+struct KernelFilmConvert;
 struct IntegratorStateCPU;
 struct TileInfo;

@@ -40,7 +42,7 @@ class CPUKernels {

  IntegratorInitFunction integrator_init_from_camera;
  IntegratorInitFunction integrator_init_from_bake;
-  IntegratorFunction integrator_intersect_closest;
+  IntegratorShadeFunction integrator_intersect_closest;
  IntegratorFunction integrator_intersect_shadow;
  IntegratorFunction integrator_intersect_subsurface;
  IntegratorFunction integrator_intersect_volume_stack;
@@ -102,6 +104,41 @@ class CPUKernels {

  CryptomattePostprocessFunction cryptomatte_postprocess;

+  /* Film Convert. */
+  using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+                                                         const float *buffer,
+                                                         float *pixel,
+                                                         const int width,
+                                                         const int buffer_stride,
+                                                         const int pixel_stride)>;
+  using FilmConvertHalfRGBAFunction =
+      CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+                                 const float *buffer,
+                                 half4 *pixel,
+                                 const int width,
+                                 const int buffer_stride)>;
+
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+  FilmConvertFunction film_convert_##name; \
+  FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
+
+  KERNEL_FILM_CONVERT_FUNCTION(depth)
+  KERNEL_FILM_CONVERT_FUNCTION(mist)
+  KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+  KERNEL_FILM_CONVERT_FUNCTION(float)
+
+  KERNEL_FILM_CONVERT_FUNCTION(light_path)
+  KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+  KERNEL_FILM_CONVERT_FUNCTION(motion)
+  KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+  KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+  KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+  KERNEL_FILM_CONVERT_FUNCTION(combined)
+  KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
  CPUKernels();
 };

--- a/intern/cycles/device/cuda/device.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -144,7 +144,6 @@ void device_cuda_info(vector<DeviceInfo> &devices)
    info.description = string(name);
    info.num = num;

-    info.has_half_images = (major >= 3);
    info.has_nanovdb = true;
    info.denoisers = 0;

--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -931,7 +931,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
 {
  CUDAContextScope scope(this);

-  /* General variables for both architectures */
  string bind_name = mem.name;
  size_t dsize = datatype_size(mem.data_type);
  size_t size = mem.memory_size();
@@ -1094,7 +1093,6 @@ void CUDADevice::tex_alloc(device_texture &mem)

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-    /* Kepler+, bindless textures. */
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -23,6 +23,7 @@
 #include "device/queue.h"

 #include "device/cpu/device.h"
+#include "device/cpu/kernel.h"
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
 #include "device/hip/device.h"
@@ -285,7 +286,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
  info.description = "Multi Device";
  info.num = 0;

-  info.has_half_images = true;
  info.has_nanovdb = true;
  info.has_osl = true;
  info.has_profiling = true;
@@ -332,7 +332,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
    }

    /* Accumulate device info. */
-    info.has_half_images &= device.has_half_images;
    info.has_nanovdb &= device.has_nanovdb;
    info.has_osl &= device.has_osl;
    info.has_profiling &= device.has_profiling;
@@ -363,10 +362,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
  return nullptr;
 }

-const CPUKernels *Device::get_cpu_kernels() const
+const CPUKernels &Device::get_cpu_kernels()
 {
-  LOG(FATAL) << "Device does not support CPU kernels.";
-  return nullptr;
+  /* Initialize CPU kernels once and reuse. */
+  static CPUKernels kernels;
+  return kernels;
 }

 void Device::get_cpu_kernel_thread_globals(
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -73,7 +73,6 @@ class DeviceInfo {
  int num;
  bool display_device;        /* GPU is used as a display device. */
  bool has_nanovdb;           /* Support NanoVDB volumes. */
-  bool has_half_images;       /* Support half-float textures. */
  bool has_osl;               /* Support Open Shading Language. */
  bool has_profiling;         /* Supports runtime collection of profiling info. */
  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
@@ -90,7 +89,6 @@ class DeviceInfo {
    num = 0;
    cpu_threads = 0;
    display_device = false;
-    has_half_images = false;
    has_nanovdb = false;
    has_osl = false;
    has_profiling = false;
@@ -180,7 +178,7 @@ class Device {
   * These may not be used on GPU or multi-devices. */

  /* Get CPU kernel functions for native instruction set. */
-  virtual const CPUKernels *get_cpu_kernels() const;
+  static const CPUKernels &get_cpu_kernels();
  /* Get kernel globals to pass to kernels. */
  virtual void get_cpu_kernel_thread_globals(
      vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -141,7 +141,6 @@ void device_hip_info(vector<DeviceInfo> &devices)
    info.description = string(name);
    info.num = num;

-    info.has_half_images = true;
    info.has_nanovdb = true;
    info.denoisers = 0;

--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -154,7 +154,7 @@ bool HIPDevice::support_device(const uint /*kernel_features*/)
    hipDeviceProp_t props;
    hipGetDeviceProperties(&props, hipDevId);

-    set_error(string_printf("HIP backend requires AMD RDNA2 graphics card or up, but found %s.",
+    set_error(string_printf("HIP backend requires AMD RDNA graphics card or up, but found %s.",
                            props.name));
    return false;
  }
@@ -222,7 +222,6 @@ string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
  const string include_path = source_path;
  string cflags = string_printf(
      "-m%d "
-      "--ptxas-options=\"-v\" "
      "--use_fast_math "
      "-DHIPCC "
      "-I\"%s\"",
@@ -234,10 +233,7 @@ string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
  return cflags;
 }

-string HIPDevice::compile_kernel(const uint kernel_features,
-                                 const char *name,
-                                 const char *base,
-                                 bool force_ptx)
+string HIPDevice::compile_kernel(const uint kernel_features, const char *name, const char *base)
 {
  /* Compute kernel name. */
  int major, minor;
@@ -247,7 +243,7 @@ string HIPDevice::compile_kernel(const uint kernel_features,
  hipGetDeviceProperties(&props, hipDevId);

  /* gcnArchName can contain tokens after the arch name with features, ie.
-    "gfx1010:sramecc-:xnack-" so we tokenize it to get the first part. */
+   * `gfx1010:sramecc-:xnack-` so we tokenize it to get the first part. */
  char *arch = strtok(props.gcnArchName, ":");
  if (arch == NULL) {
    arch = props.gcnArchName;
@@ -255,13 +251,11 @@ string HIPDevice::compile_kernel(const uint kernel_features,

  /* Attempt to use kernel provided with Blender. */
  if (!use_adaptive_compilation()) {
-    if (!force_ptx) {
-      const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch));
-      VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
-      if (path_exists(fatbin)) {
-        VLOG(1) << "Using precompiled kernel.";
-        return fatbin;
-      }
+    const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch));
+    VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
+    if (path_exists(fatbin)) {
+      VLOG(1) << "Using precompiled kernel.";
+      return fatbin;
    }
  }

@@ -298,9 +292,9 @@ string HIPDevice::compile_kernel(const uint kernel_features,

 #  ifdef _WIN32
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
-    if (major < 3) {
+    if (!hipSupportsDevice(hipDevId)) {
      set_error(
-          string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. "
+          string_printf("HIP backend requires compute capability 10.1 or up, but found %d.%d. "
                        "Your GPU is not supported.",
                        major,
                        minor));
@@ -380,10 +374,9 @@ string HIPDevice::compile_kernel(const uint kernel_features,

 bool HIPDevice::load_kernels(const uint kernel_features)
 {
-  /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
+  /* TODO(sergey): Support kernels re-load for HIP devices adaptive compile.
   *
-   * Currently re-loading kernel will invalidate memory pointers,
-   * causing problems in cuCtxSynchronize.
+   * Currently re-loading kernels will invalidate memory pointers.
   */
  if (hipModule) {
    if (use_adaptive_compilation()) {
@@ -904,7 +897,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
 {
  HIPContextScope scope(this);

-  /* General variables for both architectures */
  string bind_name = mem.name;
  size_t dsize = datatype_size(mem.data_type);
  size_t size = mem.memory_size();
@@ -1069,7 +1061,6 @@ void HIPDevice::tex_alloc(device_texture &mem)

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-    /* Kepler+, bindless textures. */
    hipResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

@@ -1160,6 +1151,8 @@ bool HIPDevice::should_use_graphics_interop()
   * possible, but from the empiric measurements it can be considerably slower than using naive
   * pixels copy. */

+  /* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
+#  if 0
  HIPContextScope scope(this);

  int num_all_devices = 0;
@@ -1178,6 +1171,7 @@ bool HIPDevice::should_use_graphics_interop()
      return true;
    }
  }
+#  endif

  return false;
 }
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -93,10 +93,7 @@ class HIPDevice : public Device {

  virtual string compile_kernel_get_common_cflags(const uint kernel_features);

-  string compile_kernel(const uint kernel_features,
-                        const char *name,
-                        const char *base = "hip",
-                        bool force_ptx = false);
+  string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip");

  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);
--- a/intern/cycles/device/hip/graphics_interop.h
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -48,7 +48,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
  HIPDeviceQueue *queue_ = nullptr;
  HIPDevice *device_ = nullptr;

-  /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
  uint opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;
--- a/intern/cycles/device/hip/util.h
+++ b/intern/cycles/device/hip/util.h
@@ -64,7 +64,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

-  return (major > 10) || (major == 10 && minor >= 3);
+  return (major > 10) || (major == 10 && minor >= 1);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -48,14 +48,6 @@ OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
 {
 }

-OptiXDevice::Denoiser::~Denoiser()
-{
-  const CUDAContextScope scope(device);
-  if (optix_denoiser != nullptr) {
-    optixDenoiserDestroy(optix_denoiser);
-  }
-}
-
 OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
    : CUDADevice(info, stats, profiler),
      sbt_data(this, "__sbt", MEM_READ_ONLY),
@@ -133,6 +125,11 @@ OptiXDevice::~OptiXDevice()
    }
  }

+  /* Make sure denoiser is destroyed before device context! */
+  if (denoiser_.optix_denoiser != nullptr) {
+    optixDenoiserDestroy(denoiser_.optix_denoiser);
+  }
+
  optixDeviceContextDestroy(context);
 }

@@ -884,27 +881,31 @@ bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
  optix_assert(optixDenoiserComputeMemoryResources(
      denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));

-  denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+  /* Denoiser is invoked on whole images only, so no overlap needed (would be used for tiling). */
+  denoiser_.scratch_size = sizes.withoutOverlapScratchSizeInBytes;
  denoiser_.scratch_offset = sizes.stateSizeInBytes;

  /* Allocate denoiser state if tile size has changed since last setup. */
  denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);

  /* Initialize denoiser state for the current tile size. */
-  const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
-                                                denoiser_.queue.stream(),
-                                                buffer_params.width,
-                                                buffer_params.height,
-                                                denoiser_.state.device_pointer,
-                                                denoiser_.scratch_offset,
-                                                denoiser_.state.device_pointer +
-                                                    denoiser_.scratch_offset,
-                                                denoiser_.scratch_size);
+  const OptixResult result = optixDenoiserSetup(
+      denoiser_.optix_denoiser,
+      0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
+            on a stream that is not the default stream */
+      buffer_params.width,
+      buffer_params.height,
+      denoiser_.state.device_pointer,
+      denoiser_.scratch_offset,
+      denoiser_.state.device_pointer + denoiser_.scratch_offset,
+      denoiser_.scratch_size);
  if (result != OPTIX_SUCCESS) {
    set_error("Failed to set up OptiX denoiser");
    return false;
  }

+  cuda_assert(cuCtxSynchronize());
+
  denoiser_.is_configured = true;
  denoiser_.configured_size.x = buffer_params.width;
  denoiser_.configured_size.y = buffer_params.height;
@@ -939,8 +940,6 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
    color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
  }

-  device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
-
  /* Optional albedo and color passes. */
  if (context.num_input_passes > 1) {
    const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
@@ -971,6 +970,7 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)

  /* Finally run denoising. */
  OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+
  OptixDenoiserLayer image_layers = {};
  image_layers.input = color_layer;
  image_layers.output = output_layer;
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -82,7 +82,6 @@ class OptiXDevice : public CUDADevice {
  class Denoiser {
   public:
    explicit Denoiser(OptiXDevice *device);
-    ~Denoiser();

    OptiXDevice *device;
    OptiXDeviceQueue queue;
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -73,7 +73,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
                        sizeof(device_ptr),
                        cuda_stream_));

-  if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
    cuda_device_assert(
        cuda_device_,
        cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -33,7 +33,10 @@ unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoisePa
    return make_unique<OptiXDenoiser>(path_trace_device, params);
  }

-  return make_unique<OIDNDenoiser>(path_trace_device, params);
+  /* Always fallback to OIDN. */
+  DenoiseParams oidn_params = params;
+  oidn_params.type = DENOISER_OPENIMAGEDENOISE;
+  return make_unique<OIDNDenoiser>(path_trace_device, oidn_params);
 }

 Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -14,9 +14,12 @@
 * limitations under the License.
 */

+#include "device/device.h"
+
 #include "integrator/pass_accessor_cpu.h"

 #include "session/buffers.h"
+
 #include "util/log.h"
 #include "util/tbb.h"

@@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
 * Kernel processing.
 */

-template<typename Processor>
-inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
-                                                           const BufferParams &buffer_params,
-                                                           const Destination &destination,
-                                                           const Processor &processor) const
-{
-  KernelFilmConvert kfilm_convert;
-  init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
-
-  if (destination.pixels) {
-    /* NOTE: No overlays are applied since they are not used for final renders.
-     * Can be supported via some sort of specialization to avoid code duplication. */
-
-    run_get_pass_kernel_processor_float(
-        &kfilm_convert, render_buffers, buffer_params, destination, processor);
-  }
-
-  if (destination.pixels_half_rgba) {
-    /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
-
-    if (destination.num_components == 1) {
-      run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
-                                              render_buffers,
-                                              buffer_params,
-                                              destination,
-                                              [&processor](const KernelFilmConvert *kfilm_convert,
-                                                           ccl_global const float *buffer,
-                                                           float *pixel_rgba) {
-                                                float pixel;
-                                                processor(kfilm_convert, buffer, &pixel);
-
-                                                pixel_rgba[0] = pixel;
-                                                pixel_rgba[1] = pixel;
-                                                pixel_rgba[2] = pixel;
-                                                pixel_rgba[3] = 1.0f;
-                                              });
-    }
-    else if (destination.num_components == 3) {
-      run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
-                                              render_buffers,
-                                              buffer_params,
-                                              destination,
-                                              [&processor](const KernelFilmConvert *kfilm_convert,
-                                                           ccl_global const float *buffer,
-                                                           float *pixel_rgba) {
-                                                processor(kfilm_convert, buffer, pixel_rgba);
-                                                pixel_rgba[3] = 1.0f;
-                                              });
-    }
-    else if (destination.num_components == 4) {
-      run_get_pass_kernel_processor_half_rgba(
-          &kfilm_convert, render_buffers, buffer_params, destination, processor);
-    }
-  }
-}
-
-template<typename Processor>
 inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
    const KernelFilmConvert *kfilm_convert,
    const RenderBuffers *render_buffers,
    const BufferParams &buffer_params,
    const Destination &destination,
-    const Processor &processor) const
+    const CPUKernels::FilmConvertFunction func) const
 {
+  /* NOTE: No overlays are applied since they are not used for final renders.
+   * Can be supported via some sort of specialization to avoid code duplication. */
+
  DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";

  const int64_t pass_stride = buffer_params.pass_stride;
@@ -112,21 +61,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
    const float *buffer = window_data + y * buffer_row_stride;
    float *pixel = destination.pixels +
                   (y * buffer_params.width + destination.offset) * pixel_stride;
-
-    for (int64_t x = 0; x < buffer_params.window_width;
-         ++x, buffer += pass_stride, pixel += pixel_stride) {
-      processor(kfilm_convert, buffer, pixel);
-    }
+    func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
  });
 }

-template<typename Processor>
 inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
    const KernelFilmConvert *kfilm_convert,
    const RenderBuffers *render_buffers,
    const BufferParams &buffer_params,
    const Destination &destination,
-    const Processor &processor) const
+    const CPUKernels::FilmConvertHalfRGBAFunction func) const
 {
  const int64_t pass_stride = buffer_params.pass_stride;
  const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
@@ -141,16 +85,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
  tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
    const float *buffer = window_data + y * buffer_row_stride;
    half4 *pixel = dst_start + y * destination_stride;
-    for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {
-
-      float pixel_rgba[4];
-      processor(kfilm_convert, buffer, pixel_rgba);
-
-      film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
-
-      *pixel = float4_to_half4_display(
-          make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
-    }
+    func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
  });
 }

@@ -163,8 +98,25 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
                                        const BufferParams &buffer_params, \
                                        const Destination &destination) const \
  { \
-    run_get_pass_kernel_processor( \
-        render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+    const CPUKernels &kernels = Device::get_cpu_kernels(); \
+    KernelFilmConvert kfilm_convert; \
+    init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
+\
+    if (destination.pixels) { \
+      run_get_pass_kernel_processor_float(&kfilm_convert, \
+                                          render_buffers, \
+                                          buffer_params, \
+                                          destination, \
+                                          kernels.film_convert_##pass); \
+    } \
+\
+    if (destination.pixels_half_rgba) { \
+      run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
+                                              render_buffers, \
+                                              buffer_params, \
+                                              destination, \
+                                              kernels.film_convert_half_rgba_##pass); \
+    } \
  }

 /* Float (scalar) passes. */
--- a/intern/cycles/integrator/pass_accessor_cpu.h
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -16,6 +16,8 @@

 #pragma once

+#include "device/cpu/kernel.h"
+
 #include "integrator/pass_accessor.h"

 CCL_NAMESPACE_BEGIN
@@ -28,25 +30,19 @@ class PassAccessorCPU : public PassAccessor {
  using PassAccessor::PassAccessor;

 protected:
-  template<typename Processor>
-  inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
-                                            const BufferParams &buffer_params,
-                                            const Destination &destination,
-                                            const Processor &processor) const;
+  inline void run_get_pass_kernel_processor_float(
+      const KernelFilmConvert *kfilm_convert,
+      const RenderBuffers *render_buffers,
+      const BufferParams &buffer_params,
+      const Destination &destination,
+      const CPUKernels::FilmConvertFunction func) const;

-  template<typename Processor>
-  inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
-                                                  const RenderBuffers *render_buffers,
-                                                  const BufferParams &buffer_params,
-                                                  const Destination &destination,
-                                                  const Processor &processor) const;
-
-  template<typename Processor>
-  inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
-                                                      const RenderBuffers *render_buffers,
-                                                      const BufferParams &buffer_params,
-                                                      const Destination &destination,
-                                                      const Processor &processor) const;
+  inline void run_get_pass_kernel_processor_half_rgba(
+      const KernelFilmConvert *kfilm_convert,
+      const RenderBuffers *render_buffers,
+      const BufferParams &buffer_params,
+      const Destination &destination,
+      const CPUKernels::FilmConvertHalfRGBAFunction func) const;

 #define DECLARE_PASS_ACCESSOR(pass) \
  virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -380,7 +380,10 @@ void PathTrace::path_trace(RenderWork &render_work)
    PathTraceWork *path_trace_work = path_trace_works_[i].get();

    PathTraceWork::RenderStatistics statistics;
-    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+    path_trace_work->render_samples(statistics,
+                                    render_work.path_trace.start_sample,
+                                    num_samples,
+                                    render_work.path_trace.sample_offset);

    const double work_time = time_dt() - work_start_time;
    work_balance_infos_[i].time_spent += work_time;
@@ -849,7 +852,8 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work)
    const int2 tile_size = get_render_tile_size();
    const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
    const int current_sample = render_work.path_trace.start_sample +
-                               render_work.path_trace.num_samples;
+                               render_work.path_trace.num_samples -
+                               render_work.path_trace.sample_offset;
    progress_->add_samples(num_samples_added, current_sample);
  }

--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -75,7 +75,10 @@ class PathTraceWork {

  /* Render given number of samples as a synchronous blocking call.
   * The samples are added to the render buffer associated with this work. */
-  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num,
+                              int sample_offset) = 0;

  /* Copy render result from this work to the corresponding place of the GPU display.
   *
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
                                   DeviceScene *device_scene,
                                   bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
-      kernels_(*(device->get_cpu_kernels()))
+      kernels_(Device::get_cpu_kernels())
 {
  DCHECK_EQ(device->info.type, DEVICE_CPU);
 }
@@ -71,14 +71,17 @@ void PathTraceWorkCPU::init_execution()

 void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
-                                      int samples_num)
+                                      int samples_num,
+                                      int sample_offset)
 {
  const int64_t image_width = effective_buffer_params_.width;
  const int64_t image_height = effective_buffer_params_.height;
  const int64_t total_pixels_num = image_width * image_height;

-  for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
-    kernel_globals.start_profiling();
+  if (device_->profiler.active()) {
+    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+      kernel_globals.start_profiling();
+    }
  }

  tbb::task_arena local_arena = local_tbb_arena_create(device_);
@@ -97,6 +100,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
      work_tile.w = 1;
      work_tile.h = 1;
      work_tile.start_sample = start_sample;
+      work_tile.sample_offset = sample_offset;
      work_tile.num_samples = 1;
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;
@@ -106,9 +110,10 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
      render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
    });
  });
-
-  for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
-    kernel_globals.stop_profiling();
+  if (device_->profiler.active()) {
+    for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+      kernel_globals.stop_profiling();
+    }
  }

  statistics.occupancy = 1.0f;
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -48,7 +48,8 @@ class PathTraceWorkCPU : public PathTraceWork {

  virtual void render_samples(RenderStatistics &statistics,
                              int start_sample,
-                              int samples_num) override;
+                              int samples_num,
+                              int sample_offset) override;

  virtual void copy_to_display(PathTraceDisplay *display,
                               PassMode pass_mode,
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -250,7 +250,8 @@ void PathTraceWorkGPU::init_execution()

 void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
-                                      int samples_num)
+                                      int samples_num,
+                                      int sample_offset)
 {
  /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
   * add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -261,6 +262,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
  work_tile_scheduler_.reset(effective_buffer_params_,
                             start_sample,
                             samples_num,
+                             sample_offset,
                             device_scene_->data.integrator.scrambling_distance);

  enqueue_reset();
@@ -437,7 +439,15 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
  DCHECK_LE(work_size, max_num_paths_);

  switch (kernel) {
-    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
+      /* Closest ray intersection kernels with integrator state and render buffer. */
+      void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+      void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+      queue_->enqueue(kernel, work_size, args);
+      break;
+    }
+
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -46,7 +46,8 @@ class PathTraceWorkGPU : public PathTraceWork {

  virtual void render_samples(RenderStatistics &statistics,
                              int start_sample,
-                              int samples_num) override;
+                              int samples_num,
+                              int sample_offset) override;

  virtual void copy_to_display(PathTraceDisplay *display,
                               PassMode pass_mode,
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -88,6 +88,16 @@ int RenderScheduler::get_num_samples() const
  return num_samples_;
 }

+void RenderScheduler::set_sample_offset(int sample_offset)
+{
+  sample_offset_ = sample_offset;
+}
+
+int RenderScheduler::get_sample_offset() const
+{
+  return sample_offset_;
+}
+
 void RenderScheduler::set_time_limit(double time_limit)
 {
  time_limit_ = time_limit;
@@ -110,13 +120,15 @@ int RenderScheduler::get_num_rendered_samples() const
  return state_.num_rendered_samples;
 }

-void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples, int sample_offset)
 {
  buffer_params_ = buffer_params;

  update_start_resolution_divider();

  set_num_samples(num_samples);
+  set_start_sample(sample_offset);
+  set_sample_offset(sample_offset);

  /* In background mode never do lower resolution render preview, as it is not really supported
   * by the software. */
@@ -171,7 +183,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)

 void RenderScheduler::reset_for_next_tile()
 {
-  reset(buffer_params_, num_samples_);
+  reset(buffer_params_, num_samples_, sample_offset_);
 }

 bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
@@ -317,6 +329,7 @@ RenderWork RenderScheduler::get_render_work()

  render_work.path_trace.start_sample = get_start_sample_to_path_trace();
  render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+  render_work.path_trace.sample_offset = get_sample_offset();

  render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());

--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -39,6 +39,7 @@ class RenderWork {
  struct {
    int start_sample = 0;
    int num_samples = 0;
+    int sample_offset = 0;
  } path_trace;

  struct {
@@ -125,6 +126,9 @@ class RenderScheduler {
  void set_num_samples(int num_samples);
  int get_num_samples() const;

+  void set_sample_offset(int sample_offset);
+  int get_sample_offset() const;
+
  /* Time limit for the path tracing tasks, in minutes.
   * Zero disables the limit. */
  void set_time_limit(double time_limit);
@@ -150,7 +154,7 @@ class RenderScheduler {

  /* Reset scheduler, indicating that rendering will happen from scratch.
   * Resets current rendered state, as well as scheduling information. */
-  void reset(const BufferParams &buffer_params, int num_samples);
+  void reset(const BufferParams &buffer_params, int num_samples, int sample_offset);

  /* Reset scheduler upon switching to a next tile.
   * Will keep the same number of samples and full-frame render parameters, but will reset progress
@@ -419,6 +423,8 @@ class RenderScheduler {
  int start_sample_ = 0;
  int num_samples_ = 0;

+  int sample_offset_ = 0;
+
  /* Limit in seconds for how long path tracing is allowed to happen.
   * Zero means no limit is applied. */
  double time_limit_ = 0.0;
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
  device->get_cpu_kernel_thread_globals(kernel_thread_globals);

  /* Find required kernel function. */
-  const CPUKernels &kernels = *(device->get_cpu_kernels());
+  const CPUKernels &kernels = Device::get_cpu_kernels();

  /* Simple parallel_for over all work items. */
  KernelShaderEvalInput *input_data = input.data();
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -36,6 +36,7 @@ void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 void WorkTileScheduler::reset(const BufferParams &buffer_params,
                              int sample_start,
                              int samples_num,
+                              int sample_offset,
                              float scrambling_distance)
 {
  /* Image buffer parameters. */
@@ -51,6 +52,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
  /* Samples parameters. */
  sample_start_ = sample_start;
  samples_num_ = samples_num;
+  sample_offset_ = sample_offset;

  /* Initialize new scheduling. */
  reset_scheduler_state();
@@ -111,6 +113,7 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_
  work_tile.h = tile_size_.height;
  work_tile.start_sample = sample_start_ + start_sample;
  work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+  work_tile.sample_offset = sample_offset_;
  work_tile.offset = offset_;
  work_tile.stride = stride_;

--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -41,6 +41,7 @@ class WorkTileScheduler {
  void reset(const BufferParams &buffer_params,
             int sample_start,
             int samples_num,
+             int sample_offset,
             float scrambling_distance);

  /* Get work for a device.
@@ -79,6 +80,7 @@ class WorkTileScheduler {
   * (splitting into a smaller work tiles). */
  int sample_start_ = 0;
  int samples_num_ = 0;
+  int sample_offset_ = 0;

  /* Tile size which be scheduled for rendering. */
  TileSize tile_size_;
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP
  device/hip/kernel.cpp
 )

+set(SRC_KERNEL_DEVICE_METAL
+  device/metal/kernel.metal
+)
+
 set(SRC_KERNEL_DEVICE_OPTIX
  device/optix/kernel.cu
  device/optix/kernel_shader_raytrace.cu
@@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
  device/optix/globals.h
 )

+set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/compat.h
+  device/metal/context_begin.h
+  device/metal/context_end.h
+  device/metal/globals.h
+)
+
 set(SRC_KERNEL_CLOSURE_HEADERS
  closure/alloc.h
  closure/bsdf.h
@@ -723,12 +734,14 @@ cycles_add_library(cycles_kernel "${LIB}"
  ${SRC_KERNEL_DEVICE_CUDA}
  ${SRC_KERNEL_DEVICE_HIP}
  ${SRC_KERNEL_DEVICE_OPTIX}
+  ${SRC_KERNEL_DEVICE_METAL}
  ${SRC_KERNEL_HEADERS}
  ${SRC_KERNEL_DEVICE_CPU_HEADERS}
  ${SRC_KERNEL_DEVICE_GPU_HEADERS}
  ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
  ${SRC_KERNEL_DEVICE_HIP_HEADERS}
  ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
+  ${SRC_KERNEL_DEVICE_METAL_HEADERS}
 )

 source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -740,6 +753,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
 source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
 source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
 source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
+source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
 source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
 source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -772,6 +786,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
--- a/intern/cycles/kernel/device/cpu/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -18,6 +18,7 @@

 /* CPU Kernel Interface */

+#include "util/half.h"
 #include "util/types.h"

 #include "kernel/types.h"
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -37,7 +37,7 @@

 KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
 KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
-KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(intersect_closest);
 KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
 KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
 KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
@@ -52,6 +52,37 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
 #undef KERNEL_INTEGRATOR_INIT_FUNCTION
 #undef KERNEL_INTEGRATOR_SHADE_FUNCTION

+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+  void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+                                                      const float *buffer, \
+                                                      float *pixel, \
+                                                      const int width, \
+                                                      const int buffer_stride, \
+                                                      const int pixel_stride); \
+  void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+      const KernelFilmConvert *kfilm_convert, \
+      const float *buffer, \
+      half4 *pixel, \
+      const int width, \
+      const int buffer_stride);
+
+KERNEL_FILM_CONVERT_FUNCTION(depth)
+KERNEL_FILM_CONVERT_FUNCTION(mist)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+KERNEL_FILM_CONVERT_FUNCTION(float)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path)
+KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+KERNEL_FILM_CONVERT_FUNCTION(combined)
+KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
 /* --------------------------------------------------------------------
 * Shader evaluation.
 */
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -47,8 +47,8 @@
 #    include "kernel/integrator/megakernel.h"

 #    include "kernel/film/adaptive_sampling.h"
-#    include "kernel/film/read.h"
 #    include "kernel/film/id_passes.h"
+#    include "kernel/film/read.h"

 #    include "kernel/bake/bake.h"

@@ -112,7 +112,7 @@ CCL_NAMESPACE_BEGIN

 DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
 DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
-DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_SHADE_KERNEL(intersect_closest)
 DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
 DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
 DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
@@ -232,6 +232,85 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
 #endif
 }

+/* --------------------------------------------------------------------
+ * Film Convert.
+ */
+
+#ifdef KERNEL_STUB
+
+#  define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+                                                        const float *buffer, \
+                                                        float *pixel, \
+                                                        const int width, \
+                                                        const int buffer_stride, \
+                                                        const int pixel_stride) \
+    { \
+      STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+    } \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+        const KernelFilmConvert *kfilm_convert, \
+        const float *buffer, \
+        half4 *pixel, \
+        const int width, \
+        const int buffer_stride) \
+    { \
+      STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+    }
+
+#else
+
+#  define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+                                                        const float *buffer, \
+                                                        float *pixel, \
+                                                        const int width, \
+                                                        const int buffer_stride, \
+                                                        const int pixel_stride) \
+    { \
+      for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
+        film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
+      } \
+    } \
+    void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+        const KernelFilmConvert *kfilm_convert, \
+        const float *buffer, \
+        half4 *pixel, \
+        const int width, \
+        const int buffer_stride) \
+    { \
+      for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
+        float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
+        film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
+        if (is_float) { \
+          pixel_rgba[1] = pixel_rgba[0]; \
+          pixel_rgba[2] = pixel_rgba[0]; \
+        } \
+        film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
+        *pixel = float4_to_half4_display( \
+            make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
+      } \
+    }
+
+#endif
+
+KERNEL_FILM_CONVERT_FUNCTION(depth, true)
+KERNEL_FILM_CONVERT_FUNCTION(mist, true)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
+KERNEL_FILM_CONVERT_FUNCTION(float, true)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
+KERNEL_FILM_CONVERT_FUNCTION(float3, false)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion, false)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
+KERNEL_FILM_CONVERT_FUNCTION(combined, false)
+KERNEL_FILM_CONVERT_FUNCTION(float4, false)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
 #undef KERNEL_INVOKE
 #undef DEFINE_INTEGRATOR_KERNEL
 #undef DEFINE_INTEGRATOR_SHADE_KERNEL
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -75,6 +75,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -92,12 +92,29 @@

 /* Compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread. */
-
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
  extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                   (block_num_threads * thread_num_registers))

+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state  */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */

 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)

 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
 template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+                                                      float x,
+                                                      float y)
 {
  ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;

@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
 /* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
 {
  ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;

@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl

 template<typename T>
 ccl_device_noinline T kernel_tex_image_interp_nanovdb(
-    const TextureInfo &info, float x, float y, float z, uint interpolation)
+    ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
 {
  using namespace nanovdb;

@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(

 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

  /* float4, byte4, ushort4 and half4 */
  const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                             float3 P,
                                             InterpolationType interp)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

  if (info.use_transform_3d) {
    P = transform_point(&info.transform_3d, P);
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif

+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+  ActiveIndexContext(int _thread_index,
+                     int _global_index,
+                     int _threadgroup_size,
+                     int _simdgroup_size,
+                     int _simd_lane_index,
+                     int _simd_group_index,
+                     int _num_simd_groups,
+                     threadgroup int *_simdgroup_offset)
+      : thread_index(_thread_index),
+        global_index(_global_index),
+        blocksize(_threadgroup_size),
+        ccl_gpu_warp_size(_simdgroup_size),
+        thread_warp(_simd_lane_index),
+        warp_index(_simd_group_index),
+        num_warps(_num_simd_groups),
+        warp_offset(_simdgroup_offset)
+  {
+  }
+
+  const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+      num_warps;
+  threadgroup int *warp_offset;
+
+  template<uint blocksizeDummy, typename IsActiveOp>
+  void active_index_array(const uint num_states,
+                          ccl_global int *indices,
+                          ccl_global int *num_indices,
+                          IsActiveOp is_active_op)
+  {
+    const uint state_index = global_index;
+#else
 template<uint blocksize, typename IsActiveOp>
 __device__ void gpu_parallel_active_index_array(const uint num_states,
-                                                int *indices,
-                                                int *num_indices,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
                                                IsActiveOp is_active_op)
 {
  extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
  const uint warp_index = thread_index / ccl_gpu_warp_size;
  const uint num_warps = blocksize / ccl_gpu_warp_size;

-  /* Test if state corresponding to this thread is active. */
  const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
-  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif

-  /* For each thread within a warp compute how many other active states precede it. */
-  const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
-  const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+    /* Test if state corresponding to this thread is active. */
+    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

-  /* Last thread in warp stores number of active states for each warp. */
-  if (thread_warp == ccl_gpu_warp_size - 1) {
-    warp_offset[warp_index] = thread_offset + is_active;
-  }
+    /* For each thread within a warp compute how many other active states precede it. */
+    const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
+                                            ccl_gpu_thread_mask(thread_warp));

-  ccl_gpu_syncthreads();
-
-  /* Last thread in block converts per-warp sizes to offsets, increments global size of
-   * index array and gets offset to write to. */
-  if (thread_index == blocksize - 1) {
-    /* TODO: parallelize this. */
-    int offset = 0;
-    for (int i = 0; i < num_warps; i++) {
-      int num_active = warp_offset[i];
-      warp_offset[i] = offset;
-      offset += num_active;
+    /* Last thread in warp stores number of active states for each warp. */
+    if (thread_warp == ccl_gpu_warp_size - 1) {
+      warp_offset[warp_index] = thread_offset + is_active;
    }

-    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
-    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+    ccl_gpu_syncthreads();
+
+    /* Last thread in block converts per-warp sizes to offsets, increments global size of
+     * index array and gets offset to write to. */
+    if (thread_index == blocksize - 1) {
+      /* TODO: parallelize this. */
+      int offset = 0;
+      for (int i = 0; i < num_warps; i++) {
+        int num_active = warp_offset[i];
+        warp_offset[i] = offset;
+        offset += num_active;
+      }
+
+      const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+      warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+    }
+
+    ccl_gpu_syncthreads();
+
+    /* Write to index array. */
+    if (is_active) {
+      const uint block_offset = warp_offset[num_warps];
+      indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+    }
  }

-  ccl_gpu_syncthreads();
+#ifdef __KERNEL_METAL__
+}; /* end class ActiveIndexContext */

-  /* Write to index array. */
-  if (is_active) {
-    const uint block_offset = warp_offset[num_warps];
-    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
-  }
-}
+/* inject the required thread params into a struct, and redirect to its templated member function
+ */
+#  define gpu_parallel_active_index_array \
+    ActiveIndexContext(metal_local_id, \
+                       metal_global_id, \
+                       metal_local_size, \
+                       simdgroup_size, \
+                       simd_lane_index, \
+                       simd_group_index, \
+                       num_simd_groups, \
+                       simdgroup_offset) \
+        .active_index_array
+#endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif

-template<uint blocksize>
-__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+                                        ccl_global int *counter,
+                                        ccl_global int *prefix_sum,
+                                        const int num_values)
 {
-  if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+  if (global_id != 0) {
    return;
  }

--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)

-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+                                                const uint num_states,
                                                const int num_states_limit,
-                                                int *indices,
-                                                int *num_indices,
-                                                int *key_counter,
-                                                int *key_prefix_sum,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
+                                                ccl_global int *key_counter,
+                                                ccl_global int *key_prefix_sum,
                                                GetKeyOp get_key_op)
 {
-  const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
  const int key = (state_index < num_states) ? get_key_op(state_index) :
                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -74,6 +74,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@

 /* Compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread. */
-
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
  extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                   (block_num_threads * thread_num_registers))

+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state  */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */

 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -58,6 +58,98 @@ using namespace metal;

 #define kernel_assert(cond)

+#define ccl_gpu_global_id_x() metal_global_id
+#define ccl_gpu_warp_size simdgroup_size
+#define ccl_gpu_thread_idx_x simd_group_index
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
+
+#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
+#define ccl_gpu_popc(x) popcount(x)
+
+// clang-format off
+
+/* kernel.h adapters */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+/* Convert a comma-separated list into a semicolon-separated list
+ * (so that we can generate a struct based on kernel entry-point parameters). */
+#define FN0()
+#define FN1(p1) p1;
+#define FN2(p1, p2) p1; p2;
+#define FN3(p1, p2, p3) p1; p2; p3;
+#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
+#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
+#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
+#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
+#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
+#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
+#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
+#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
+#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
+#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
+#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
+#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
+#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+
+/* Generate a struct containing the entry-point parameters and a "run"
+ * method which can access them implicitly via this-> */
+#define ccl_gpu_kernel_signature(name, ...) \
+struct kernel_gpu_##name \
+{ \
+  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
+  void run(thread MetalKernelContext& context, \
+           threadgroup int *simdgroup_offset, \
+           const uint metal_global_id, \
+           const ushort metal_local_id, \
+           const ushort metal_local_size, \
+           uint simdgroup_size, \
+           uint simd_lane_index, \
+           uint simd_group_index, \
+           uint num_simd_groups) ccl_global const; \
+}; \
+kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
+                                constant MetalAncillaries *_metal_ancillaries, \
+                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                const uint metal_global_id [[thread_position_in_grid]], \
+                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
+                                const ushort metal_local_size [[threads_per_threadgroup]], \
+                                uint simdgroup_size [[threads_per_simdgroup]], \
+                                uint simd_lane_index [[thread_index_in_simdgroup]], \
+                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
+                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
+  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
+  INIT_DEBUG_BUFFER \
+  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+} \
+void kernel_gpu_##name::run(thread MetalKernelContext& context, \
+                  threadgroup int *simdgroup_offset, \
+                  const uint metal_global_id, \
+                  const ushort metal_local_id, \
+                  const ushort metal_local_size, \
+                  uint simdgroup_size, \
+                  uint simd_lane_index, \
+                  uint simd_group_index, \
+                  uint num_simd_groups) ccl_global const
+
+#define ccl_gpu_kernel_call(x) context.x
+
+/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state  */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda \
+  { \
+    KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
+    ccl_private MetalKernelContext &context; \
+    __VA_ARGS__; \
+    int operator()(const int state) const { return (func); } \
+  } ccl_gpu_kernel_lambda_pass(context)
+
+// clang-format on
+
 /* make_type definitions with Metal style element initializers */
 #ifdef make_float2
 #  undef make_float2
@@ -124,3 +216,38 @@ using namespace metal;
 #define logf(x) trigmode::log(float(x))

 #define NULL 0
+
+/* texture bindings and sampler setup */
+
+struct Texture2DParamsMetal {
+  texture2d<float, access::sample> tex;
+};
+struct Texture3DParamsMetal {
+  texture3d<float, access::sample> tex;
+};
+
+struct MetalAncillaries {
+  device Texture2DParamsMetal *textures_2d;
+  device Texture3DParamsMetal *textures_3d;
+};
+
+enum SamplerType {
+  SamplerFilterNearest_AddressRepeat,
+  SamplerFilterNearest_AddressClampEdge,
+  SamplerFilterNearest_AddressClampZero,
+
+  SamplerFilterLinear_AddressRepeat,
+  SamplerFilterLinear_AddressClampEdge,
+  SamplerFilterLinear_AddressClampZero,
+
+  SamplerCount
+};
+
+constant constexpr array<sampler, SamplerCount> metal_samplers = {
+    sampler(address::repeat, filter::nearest),
+    sampler(address::clamp_to_edge, filter::nearest),
+    sampler(address::clamp_to_zero, filter::nearest),
+    sampler(address::repeat, filter::linear),
+    sampler(address::clamp_to_edge, filter::linear),
+    sampler(address::clamp_to_zero, filter::linear),
+};
--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+
+/* Open the Metal kernel context class
+ * Necessary to access resource bindings */
+class MetalKernelContext {
+  public:
+    constant KernelParamsMetal &launch_params_metal;
+    constant MetalAncillaries *metal_ancillaries;
+
+    MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
+    : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
+    {}
+
+    /* texture fetch adapter functions */
+    typedef uint64_t ccl_gpu_tex_object;
+
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      kernel_assert(0);
+      return 0;
+    }
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      kernel_assert(0);
+      return 0;
+    }
+
+    // texture2d
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+    }
+
+    // texture3d
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+    }
+#    include "kernel/device/gpu/image.h"
+
+  // clang-format on
--- a/intern/cycles/kernel/device/metal/context_end.h
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+}
+; /* end of MetalKernelContext class definition */
+
+/* Silently redirect into the MetalKernelContext instance */
+/* NOTE: These macros will need maintaining as entry-points change. */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state context.launch_params_metal.__integrator_state
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+#include "kernel/integrator/state.h"
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct KernelParamsMetal {
+
+#define KERNEL_TEX(type, name) ccl_constant type *name;
+#include "kernel/textures.h"
+#undef KERNEL_TEX
+
+  const IntegratorStateGPU __integrator_state;
+  const KernelData data;
+
+} KernelParamsMetal;
+
+typedef struct KernelGlobalsGPU {
+  int unused[1];
+} KernelGlobalsGPU;
+
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data launch_params_metal.data
+#define kernel_integrator_state launch_params_metal.__integrator_state
+
+/* data lookup defines */
+
+#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
+#define kernel_tex_array(tex) launch_params_metal.tex
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Metal kernel entry points */
+
+// clang-format off
+
+#include "kernel/device/metal/compat.h"
+#include "kernel/device/metal/globals.h"
+#include "kernel/device/gpu/kernel.h"
+
+// clang-format on
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -76,6 +76,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))

 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -57,7 +57,7 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
  const int global_index = optixGetLaunchIndex().x;
  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
                                                       global_index;
-  integrator_intersect_closest(nullptr, path_index);
+  integrator_intersect_closest(nullptr, path_index, __params.render_buffer);
 }

 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -33,62 +33,72 @@ CCL_NAMESPACE_BEGIN
 * them separately. */

 ccl_device_inline void bsdf_eval_init(ccl_private BsdfEval *eval,
-                                      const bool is_diffuse,
+                                      const ClosureType closure_type,
                                      float3 value)
 {
  eval->diffuse = zero_float3();
  eval->glossy = zero_float3();

-  if (is_diffuse) {
+  if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
    eval->diffuse = value;
  }
-  else {
+  else if (CLOSURE_IS_BSDF_GLOSSY(closure_type)) {
    eval->glossy = value;
  }
+
+  eval->sum = value;
 }

 ccl_device_inline void bsdf_eval_accum(ccl_private BsdfEval *eval,
-                                       const bool is_diffuse,
-                                       float3 value,
-                                       float mis_weight)
+                                       const ClosureType closure_type,
+                                       float3 value)
 {
-  value *= mis_weight;
-
-  if (is_diffuse) {
+  if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
    eval->diffuse += value;
  }
-  else {
+  else if (CLOSURE_IS_BSDF_GLOSSY(closure_type)) {
    eval->glossy += value;
  }
+
+  eval->sum += value;
 }

 ccl_device_inline bool bsdf_eval_is_zero(ccl_private BsdfEval *eval)
 {
-  return is_zero(eval->diffuse) && is_zero(eval->glossy);
+  return is_zero(eval->sum);
 }

 ccl_device_inline void bsdf_eval_mul(ccl_private BsdfEval *eval, float value)
 {
  eval->diffuse *= value;
  eval->glossy *= value;
+  eval->sum *= value;
 }

 ccl_device_inline void bsdf_eval_mul3(ccl_private BsdfEval *eval, float3 value)
 {
  eval->diffuse *= value;
  eval->glossy *= value;
+  eval->sum *= value;
 }

 ccl_device_inline float3 bsdf_eval_sum(ccl_private const BsdfEval *eval)
 {
-  return eval->diffuse + eval->glossy;
+  return eval->sum;
 }

-ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(ccl_private const BsdfEval *eval)
+ccl_device_inline float3 bsdf_eval_pass_diffuse_weight(ccl_private const BsdfEval *eval)
 {
-  /* Ratio of diffuse and glossy to recover proportions for writing to render pass.
+  /* Ratio of diffuse weight to recover proportions for writing to render pass.
   * We assume reflection, transmission and volume scatter to be exclusive. */
-  return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
+  return safe_divide_float3_float3(eval->diffuse, eval->sum);
+}
+
+ccl_device_inline float3 bsdf_eval_pass_glossy_weight(ccl_private const BsdfEval *eval)
+{
+  /* Ratio of glossy weight to recover proportions for writing to render pass.
+   * We assume reflection, transmission and volume scatter to be exclusive. */
+  return safe_divide_float3_float3(eval->glossy, eval->sum);
 }

 /* --------------------------------------------------------------------
@@ -141,7 +151,8 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
 ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
                                          ConstIntegratorState state,
                                          ccl_global float *ccl_restrict render_buffer,
-                                          int sample)
+                                          int sample,
+                                          int sample_offset)
 {
  if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
    return sample;
@@ -149,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);

-  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
+  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+         sample_offset;
 }

 ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
@@ -351,37 +363,47 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(KernelGlobals kg
    /* Directly visible, write to emission or background pass. */
    pass_offset = pass;
  }
-  else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
-    /* Indirectly visible through reflection. */
-    const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
-                                       ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
-                                            kernel_data.film.pass_glossy_direct :
-                                            kernel_data.film.pass_glossy_indirect) :
-                                       ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
-                                            kernel_data.film.pass_transmission_direct :
-                                            kernel_data.film.pass_transmission_indirect);
+  else if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    if (path_flag & PATH_RAY_SURFACE_PASS) {
+      /* Indirectly visible through reflection. */
+      const float3 diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+      const float3 glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);

-    if (glossy_pass_offset != PASS_UNUSED) {
-      /* Glossy is a subset of the throughput, reconstruct it here using the
-       * diffuse-glossy ratio. */
-      const float3 ratio = INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
-      const float3 glossy_contribution = (one_float3() - ratio) * contribution;
-      kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
-    }
+      /* Glossy */
+      const int glossy_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
+                                          kernel_data.film.pass_glossy_direct :
+                                          kernel_data.film.pass_glossy_indirect);
+      if (glossy_pass_offset != PASS_UNUSED) {
+        kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
+      }

-    /* Reconstruct diffuse subset of throughput. */
-    pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
-                      kernel_data.film.pass_diffuse_direct :
-                      kernel_data.film.pass_diffuse_indirect;
-    if (pass_offset != PASS_UNUSED) {
-      contribution *= INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
+      /* Transmission */
+      const int transmission_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
+                                                kernel_data.film.pass_transmission_direct :
+                                                kernel_data.film.pass_transmission_indirect);
+
+      if (transmission_pass_offset != PASS_UNUSED) {
+        /* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
+         * GPU memory. */
+        const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
+        kernel_write_pass_float3(buffer + transmission_pass_offset,
+                                 transmission_weight * contribution);
+      }
+
+      /* Reconstruct diffuse subset of throughput. */
+      pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
+                        kernel_data.film.pass_diffuse_direct :
+                        kernel_data.film.pass_diffuse_indirect;
+      if (pass_offset != PASS_UNUSED) {
+        contribution *= diffuse_weight;
+      }
+    }
+    else if (path_flag & PATH_RAY_VOLUME_PASS) {
+      /* Indirectly visible through volume. */
+      pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
+                        kernel_data.film.pass_volume_direct :
+                        kernel_data.film.pass_volume_indirect;
    }
-  }
-  else if (path_flag & PATH_RAY_VOLUME_PASS) {
-    /* Indirectly visible through volume. */
-    pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
-                      kernel_data.film.pass_volume_direct :
-                      kernel_data.film.pass_volume_indirect;
  }

  /* Single write call for GPU coherence. */
@@ -426,45 +448,56 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
 #ifdef __PASSES__
  if (kernel_data.film.light_pass_flag & PASS_ANY) {
    const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
-    int pass_offset = PASS_UNUSED;

-    if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
-      /* Indirectly visible through reflection. */
-      const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
-                                         ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
-                                              kernel_data.film.pass_glossy_direct :
-                                              kernel_data.film.pass_glossy_indirect) :
-                                         ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
-                                              kernel_data.film.pass_transmission_direct :
-                                              kernel_data.film.pass_transmission_indirect);
+    if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+      int pass_offset = PASS_UNUSED;

-      if (glossy_pass_offset != PASS_UNUSED) {
-        /* Glossy is a subset of the throughput, reconstruct it here using the
-         * diffuse-glossy ratio. */
-        const float3 ratio = INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
-        const float3 glossy_contribution = (one_float3() - ratio) * contribution;
-        kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+      if (path_flag & PATH_RAY_SURFACE_PASS) {
+        /* Indirectly visible through reflection. */
+        const float3 diffuse_weight = INTEGRATOR_STATE(state, shadow_path, pass_diffuse_weight);
+        const float3 glossy_weight = INTEGRATOR_STATE(state, shadow_path, pass_glossy_weight);
+
+        /* Glossy */
+        const int glossy_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+                                            kernel_data.film.pass_glossy_direct :
+                                            kernel_data.film.pass_glossy_indirect);
+        if (glossy_pass_offset != PASS_UNUSED) {
+          kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
+        }
+
+        /* Transmission */
+        const int transmission_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+                                                  kernel_data.film.pass_transmission_direct :
+                                                  kernel_data.film.pass_transmission_indirect);
+
+        if (transmission_pass_offset != PASS_UNUSED) {
+          /* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
+           * GPU memory. */
+          const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
+          kernel_write_pass_float3(buffer + transmission_pass_offset,
+                                   transmission_weight * contribution);
+        }
+
+        /* Reconstruct diffuse subset of throughput. */
+        pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+                          kernel_data.film.pass_diffuse_direct :
+                          kernel_data.film.pass_diffuse_indirect;
+        if (pass_offset != PASS_UNUSED) {
+          contribution *= diffuse_weight;
+        }
+      }
+      else if (path_flag & PATH_RAY_VOLUME_PASS) {
+        /* Indirectly visible through volume. */
+        pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+                          kernel_data.film.pass_volume_direct :
+                          kernel_data.film.pass_volume_indirect;
      }

-      /* Reconstruct diffuse subset of throughput. */
-      pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
-                        kernel_data.film.pass_diffuse_direct :
-                        kernel_data.film.pass_diffuse_indirect;
+      /* Single write call for GPU coherence. */
      if (pass_offset != PASS_UNUSED) {
-        contribution *= INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
+        kernel_write_pass_float3(buffer + pass_offset, contribution);
      }
    }
-    else if (path_flag & PATH_RAY_VOLUME_PASS) {
-      /* Indirectly visible through volume. */
-      pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
-                        kernel_data.film.pass_volume_direct :
-                        kernel_data.film.pass_volume_indirect;
-    }
-
-    /* Single write call for GPU coherence. */
-    if (pass_offset != PASS_UNUSED) {
-      kernel_write_pass_float3(buffer + pass_offset, contribution);
-    }

    /* Write shadow pass. */
    if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
@@ -540,11 +573,10 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
 /* Write emission to render buffer. */
 ccl_device_inline void kernel_accum_emission(KernelGlobals kg,
                                             ConstIntegratorState state,
-                                             const float3 throughput,
                                             const float3 L,
                                             ccl_global float *ccl_restrict render_buffer)
 {
-  float3 contribution = throughput * L;
+  float3 contribution = L;
  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
--- a/intern/cycles/kernel/film/passes.h
+++ b/intern/cycles/kernel/film/passes.h
@@ -160,40 +160,6 @@ ccl_device_forceinline void kernel_write_denoising_features_volume(KernelGlobals
 }
 #endif /* __DENOISING_FEATURES__ */

-#ifdef __SHADOW_CATCHER__
-
-/* Write shadow catcher passes on a bounce from the shadow catcher object. */
-ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
-    KernelGlobals kg,
-    IntegratorState state,
-    ccl_private const ShaderData *sd,
-    ccl_global float *ccl_restrict render_buffer)
-{
-  if (!kernel_data.integrator.has_shadow_catcher) {
-    return;
-  }
-
-  kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
-  kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-
-  if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, sd->object_flag)) {
-    return;
-  }
-
-  ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
-
-  /* Count sample for the shadow catcher object. */
-  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
-
-  /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
-   * transparency to the matte. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
-                          average(throughput));
-}
-
-#endif /* __SHADOW_CATCHER__ */
-
 ccl_device_inline size_t kernel_write_id_pass(ccl_global float *ccl_restrict buffer,
                                              size_t depth,
                                              float id,
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -65,7 +65,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
  }

  /* Always count the sample, even if the camera sample will reject the ray. */
-  const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+  const int sample = kernel_accum_sample(
+      kg, state, render_buffer, scheduled_sample, tile->sample_offset);

  /* Setup render buffers. */
  const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -89,7 +89,8 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
   * This logic allows to both count actual number of samples per pixel, and to add samples to this
   * pixel after it was converged and samples were added somewhere else (in which case the
   * `scheduled_sample` will be different from actual number of samples in this pixel). */
-  const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+  const int sample = kernel_accum_sample(
+      kg, state, render_buffer, scheduled_sample, tile->sample_offset);

  /* Initialize random number seed for path. */
  const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -31,7 +31,6 @@

 CCL_NAMESPACE_BEGIN

-template<uint32_t current_kernel>
 ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
                                                           IntegratorState state,
                                                           const int shader_flags)
@@ -86,36 +85,80 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
  return false;
 }

-/* Note that current_kernel is a template value since making this a variable
- * leads to poor performance with CUDA atomics. */
-template<uint32_t current_kernel>
-ccl_device_forceinline void integrator_intersect_shader_next_kernel(
+#ifdef __SHADOW_CATCHER__
+/* Split path if a shadow catcher was hit. */
+ccl_device_forceinline void integrator_split_shadow_catcher(
    KernelGlobals kg,
    IntegratorState state,
    ccl_private const Intersection *ccl_restrict isect,
-    const int shader,
-    const int shader_flags)
+    ccl_global float *ccl_restrict render_buffer)
 {
-  /* Note on scheduling.
-   *
-   * When there is no shadow catcher split the scheduling is simple: schedule surface shading with
-   * or without raytrace support, depending on the shader used.
-   *
-   * When there is a shadow catcher split the general idea is to have the following configuration:
-   *
-   *  - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
-   *    will trace shadow catcher object.
-   *
-   *  - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
-   *    the matte ray.
-   *
-   *  - Otherwise schedule background shading kernel, so that we have a background to alpha-over
-   *    on. The background kernel will then schedule surface shading for the matte ray.
+  /* Test if we hit a shadow catcher object, and potentially split the path to continue tracing two
+   * paths from here. */
+  const int object_flags = intersection_get_object_flags(kg, isect);
+  if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
+    return;
+  }
+
+  kernel_write_shadow_catcher_bounce_data(kg, state, render_buffer);
+
+  /* Mark state as having done a shadow catcher split so that it stops contributing to
+   * the shadow catcher matte pass, but keeps contributing to the combined pass. */
+  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+  /* Copy current state to new state. */
+  state = integrator_state_shadow_catcher_split(kg, state);
+
+  /* Initialize new state.
   *
   * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
   * the matte path. */

-  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
+  /* Mark current state so that it will only track contribution of shadow catcher objects ignoring
+   * non-catcher objects. */
+  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
+
+  if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
+    /* If using background pass, schedule background shading kernel so that we have a background
+     * to alpha-over on. The background kernel will then continue the path afterwards. */
+    INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    return;
+  }
+
+  if (!integrator_state_volume_stack_is_empty(kg, state)) {
+    /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
+     * objects from it, and then continue shading volume and shadow catcher surface after. */
+    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    return;
+  }
+
+  /* Continue with shading shadow catcher surface. */
+  const int shader = intersection_get_shader(kg, isect);
+  const int flags = kernel_tex_fetch(__shaders, shader).flags;
+  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+
+  if (use_raytrace_kernel) {
+    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+  }
+  else {
+    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+  }
+}
+
+/* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
+    KernelGlobals kg, IntegratorState state)
+{
+  /* Continue with shading shadow catcher surface. Same as integrator_split_shadow_catcher, but
+   * using NEXT instead of INIT. */
+  Intersection isect ccl_optional_struct_init;
+  integrator_state_read_isect(kg, state, &isect);
+
+  const int shader = intersection_get_shader(kg, &isect);
+  const int flags = kernel_tex_fetch(__shaders, shader).flags;
+  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);

  if (use_raytrace_kernel) {
    INTEGRATOR_PATH_NEXT_SORTED(
@@ -124,26 +167,141 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
  else {
    INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
  }
-
-#ifdef __SHADOW_CATCHER__
-  const int object_flags = intersection_get_object_flags(kg, isect);
-  if (kernel_shadow_catcher_split(kg, state, object_flags)) {
-    if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
-      INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
-
-      INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
-    }
-    else if (use_raytrace_kernel) {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
-    }
-    else {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
-    }
-  }
-#endif
 }

-ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state)
+/* Schedule next kernel to be executed after executing background shader for shadow catcher. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
+    KernelGlobals kg, IntegratorState state)
+{
+  /* Same logic as integrator_split_shadow_catcher, but using NEXT instead of INIT. */
+  if (!integrator_state_volume_stack_is_empty(kg, state)) {
+    /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
+     * objects from it, and then continue shading volume and shadow catcher surface after. */
+    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    return;
+  }
+
+  /* Continue with shading shadow catcher surface. */
+  integrator_intersect_next_kernel_after_shadow_catcher_volume<current_kernel>(kg, state);
+}
+#endif
+
+/* Schedule next kernel to be executed after intersect closest.
+ *
+ * Note that current_kernel is a template value since making this a variable
+ * leads to poor performance with CUDA atomics. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel(
+    KernelGlobals kg,
+    IntegratorState state,
+    ccl_private const Intersection *ccl_restrict isect,
+    ccl_global float *ccl_restrict render_buffer,
+    const bool hit)
+{
+  /* Continue with volume kernel if we are inside a volume, regardless if we hit anything. */
+#ifdef __VOLUME__
+  if (!integrator_state_volume_stack_is_empty(kg, state)) {
+    const bool hit_surface = hit && !(isect->type & PRIMITIVE_LAMP);
+    const int shader = (hit_surface) ? intersection_get_shader(kg, isect) : SHADER_NONE;
+    const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+
+    if (!integrator_intersect_terminate(kg, state, flags)) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+    }
+    else {
+      INTEGRATOR_PATH_TERMINATE(current_kernel);
+    }
+    return;
+  }
+#endif
+
+  if (hit) {
+    /* Hit a surface, continue with light or surface kernel. */
+    if (isect->type & PRIMITIVE_LAMP) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+    }
+    else {
+      /* Hit a surface, continue with surface kernel unless terminated. */
+      const int shader = intersection_get_shader(kg, isect);
+      const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+      if (!integrator_intersect_terminate(kg, state, flags)) {
+        const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+        if (use_raytrace_kernel) {
+          INTEGRATOR_PATH_NEXT_SORTED(
+              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+        }
+        else {
+          INTEGRATOR_PATH_NEXT_SORTED(
+              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+        }
+
+#ifdef __SHADOW_CATCHER__
+        /* Handle shadow catcher. */
+        integrator_split_shadow_catcher(kg, state, isect, render_buffer);
+#endif
+      }
+      else {
+        INTEGRATOR_PATH_TERMINATE(current_kernel);
+      }
+    }
+  }
+  else {
+    /* Nothing hit, continue with background kernel. */
+    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+  }
+}
+
+/* Schedule next kernel to be executed after shade volume.
+ *
+ * The logic here matches integrator_intersect_next_kernel, except that
+ * volume shading and termination testing have already been done. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
+    KernelGlobals kg,
+    IntegratorState state,
+    ccl_private const Intersection *ccl_restrict isect,
+    ccl_global float *ccl_restrict render_buffer)
+{
+  if (isect->prim != PRIM_NONE) {
+    /* Hit a surface, continue with light or surface kernel. */
+    if (isect->type & PRIMITIVE_LAMP) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+      return;
+    }
+    else {
+      /* Hit a surface, continue with surface kernel unless terminated. */
+      const int shader = intersection_get_shader(kg, isect);
+      const int flags = kernel_tex_fetch(__shaders, shader).flags;
+      const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+
+      if (use_raytrace_kernel) {
+        INTEGRATOR_PATH_NEXT_SORTED(
+            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+      }
+      else {
+        INTEGRATOR_PATH_NEXT_SORTED(
+            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+      }
+
+#ifdef __SHADOW_CATCHER__
+      /* Handle shadow catcher. */
+      integrator_split_shadow_catcher(kg, state, isect, render_buffer);
+#endif
+      return;
+    }
+  }
+  else {
+    /* Nothing hit, continue with background kernel. */
+    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    return;
+  }
+}
+
+ccl_device void integrator_intersect_closest(KernelGlobals kg,
+                                             IntegratorState state,
+                                             ccl_global float *ccl_restrict render_buffer)
 {
  PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);

@@ -192,56 +350,9 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState s
  /* Write intersection result into global integrator state memory. */
  integrator_state_write_isect(kg, state, &isect);

-#ifdef __VOLUME__
-  if (!integrator_state_volume_stack_is_empty(kg, state)) {
-    const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
-    const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
-    const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
-
-    if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
-            kg, state, flags)) {
-      /* Continue with volume kernel if we are inside a volume, regardless
-       * if we hit anything. */
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
-    }
-    else {
-      INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
-    }
-    return;
-  }
-#endif
-
-  if (hit) {
-    /* Hit a surface, continue with light or surface kernel. */
-    if (isect.type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
-      return;
-    }
-    else {
-      /* Hit a surface, continue with surface kernel unless terminated. */
-      const int shader = intersection_get_shader(kg, &isect);
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
-      if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
-              kg, state, flags)) {
-        integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
-            kg, state, &isect, shader, flags);
-        return;
-      }
-      else {
-        INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
-        return;
-      }
-    }
-  }
-  else {
-    /* Nothing hit, continue with background kernel. */
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
-                         DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
-    return;
-  }
+  /* Setup up next kernel to be executed. */
+  integrator_intersect_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+      kg, state, &isect, render_buffer, hit);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -42,10 +42,13 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
  /* Store to avoid global fetches on every intersection step. */
  const uint volume_stack_size = kernel_data.volume_stack_size;

+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+  const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_ALL_VISIBILITY);
+
 #ifdef __VOLUME_RECORD_ALL__
  Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
+      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
  if (num_hits > 0) {
    Intersection *isect = hits;

@@ -60,7 +63,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
  Intersection isect;
  int step = 0;
  while (step < 2 * volume_stack_size &&
-         scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+         scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
    shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
    volume_stack_enter_exit(kg, state, stack_sd);

@@ -74,7 +77,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
 #endif
 }

-ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
+ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState state)
 {
  PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);

@@ -89,14 +92,20 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
  volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
  volume_ray.t = FLT_MAX;

-  const uint visibility = (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_ALL_VISIBILITY);
  int stack_index = 0, enclosed_index = 0;

-  /* Write background shader. */
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+  const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_CAMERA);
+
+  /* Initialize volume stack with background volume For shadow catcher the
+   * background volume is always assumed to be CG. */
  if (kernel_data.background.volume_shader != SHADER_NONE) {
-    const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
-    integrator_state_write_volume_stack(state, stack_index, new_entry);
-    stack_index++;
+    if (!(path_flag & PATH_RAY_SHADOW_CATCHER_PASS)) {
+      INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, stack_index, object) = OBJECT_NONE;
+      INTEGRATOR_STATE_ARRAY_WRITE(
+          state, volume_stack, stack_index, shader) = kernel_data.background.volume_shader;
+      stack_index++;
+    }
  }

  /* Store to avoid global fetches on every intersection step. */
@@ -202,9 +211,22 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
  /* Write terminator. */
  const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
  integrator_state_write_volume_stack(state, stack_index, new_entry);
+}

-  INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
-                       DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
+{
+  integrator_volume_stack_init(kg, state);
+
+  if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_PASS) {
+    /* Volume stack re-init for shadow catcher, continue with shading of hit. */
+    integrator_intersect_next_kernel_after_shadow_catcher_volume<
+        DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK>(kg, state);
+  }
+  else {
+    /* Volume stack init for camera rays, continue with intersection of camera ray. */
+    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+  }
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/megakernel.h
+++ b/intern/cycles/kernel/integrator/megakernel.h
@@ -76,7 +76,7 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
    if (queued_kernel) {
      switch (queued_kernel) {
        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
-          integrator_intersect_closest(kg, state);
+          integrator_intersect_closest(kg, state, render_buffer);
          break;
        case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
          integrator_shade_background(kg, state, render_buffer);
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -185,7 +185,7 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,

    /* Render pass categories. */
    if (bounce == 1) {
-      flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
+      flag |= PATH_RAY_SURFACE_PASS;
    }
  }

--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -175,7 +175,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,

      /* Write to render buffer. */
      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      kernel_accum_emission(kg, state, throughput, light_eval, render_buffer);
+      kernel_accum_emission(kg, state, throughput * light_eval, render_buffer);
    }
  }
 }
@@ -192,23 +192,11 @@ ccl_device void integrator_shade_background(KernelGlobals kg,

 #ifdef __SHADOW_CATCHER__
  if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+    /* Special case for shadow catcher where we want to fill the background pass
+     * behind the shadow catcher but also continue tracing the path. */
    INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
-
-    const int isect_prim = INTEGRATOR_STATE(state, isect, prim);
-    const int isect_type = INTEGRATOR_STATE(state, isect, type);
-    const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
-    const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
-
-    if (shader_flags & SD_HAS_RAYTRACE) {
-      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
-                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
-                                  shader);
-    }
-    else {
-      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
-                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
-                                  shader);
-    }
+    integrator_intersect_next_kernel_after_shadow_catcher_background<
+        DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND>(kg, state);
    return;
  }
 #endif
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -90,7 +90,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,

  /* Write to render buffer. */
  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_accum_emission(kg, state, throughput, light_eval, render_buffer);
+  kernel_accum_emission(kg, state, throughput * light_eval, render_buffer);
 }

 ccl_device void integrator_shade_light(KernelGlobals kg,
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -101,7 +101,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
  }

  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_accum_emission(kg, state, throughput, L, render_buffer);
+  kernel_accum_emission(kg, state, throughput * L, render_buffer);
 }
 #endif /* __EMISSION__ */

@@ -191,14 +191,18 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
  const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
  uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
  shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
+  shadow_flag |= PATH_RAY_SURFACE_PASS;
  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 diffuse_glossy_ratio = (bounce == 0) ?
-                                            bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
-                                            INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+    const float3 pass_diffuse_weight = (bounce == 0) ?
+                                           bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
+                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const float3 pass_glossy_weight = (bounce == 0) ?
+                                          bsdf_eval_pass_glossy_weight(&bsdf_eval) :
+                                          INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
  }

  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
@@ -283,7 +287,9 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
    if (INTEGRATOR_STATE(state, path, bounce) == 0) {
-      INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(
+      INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = bsdf_eval_pass_diffuse_weight(
+          &bsdf_eval);
+      INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = bsdf_eval_pass_glossy_weight(
          &bsdf_eval);
    }
  }
@@ -445,7 +451,7 @@ ccl_device bool integrate_surface(KernelGlobals kg,
    }
 #endif

-    shader_prepare_surface_closures(kg, state, &sd);
+    shader_prepare_surface_closures(kg, state, &sd, path_flag);

 #ifdef __HOLDOUT__
    /* Evaluate holdout. */
@@ -492,10 +498,6 @@ ccl_device bool integrate_surface(KernelGlobals kg,
    kernel_write_denoising_features_surface(kg, state, &sd, render_buffer);
 #endif

-#ifdef __SHADOW_CATCHER__
-    kernel_write_shadow_catcher_bounce_data(kg, state, &sd, render_buffer);
-#endif
-
    /* Direct light. */
    PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
    integrate_surface_direct_light(kg, state, &sd, &rng_state);
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -608,7 +608,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
        if (!result.indirect_scatter) {
          const float3 emission = volume_emission_integrate(
              &coeff, closure_flag, transmittance, dt);
-          accum_emission += emission;
+          accum_emission += result.indirect_throughput * emission;
        }
      }

@@ -661,7 +661,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(

  /* Write accumulated emission. */
  if (!is_zero(accum_emission)) {
-    kernel_accum_emission(kg, state, result.indirect_throughput, accum_emission, render_buffer);
+    kernel_accum_emission(kg, state, accum_emission, render_buffer);
  }

 #  ifdef __DENOISING_FEATURES__
@@ -794,10 +794,11 @@ ccl_device_forceinline void integrate_volume_direct_light(
  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 diffuse_glossy_ratio = (bounce == 0) ?
-                                            one_float3() :
-                                            INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+    const float3 pass_diffuse_weight = (bounce == 0) ?
+                                           one_float3() :
+                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
  }

  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
@@ -876,7 +877,8 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
  INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput_phase;

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = one_float3();
+    INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
+    INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
  }

  /* Update path state */
@@ -1023,25 +1025,9 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
  }
  else {
    /* Continue to background, light or surface. */
-    if (isect.prim == PRIM_NONE) {
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
-      return;
-    }
-    else if (isect.type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
-                           DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
-      return;
-    }
-    else {
-      /* Hit a surface, continue with surface kernel unless terminated. */
-      const int shader = intersection_get_shader(kg, &isect);
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
-      integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
-          kg, state, &isect, shader, flags);
-      return;
-    }
+    integrator_intersect_next_kernel_after_volume<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
+        kg, state, &isect, render_buffer);
+    return;
  }
 #endif /* __VOLUME__ */
 }
--- a/intern/cycles/kernel/integrator/shader_eval.h
+++ b/intern/cycles/kernel/integrator/shader_eval.h
@@ -105,8 +105,45 @@ ccl_device_inline void shader_copy_volume_phases(ccl_private ShaderVolumePhases

 ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
                                                       ConstIntegratorState state,
-                                                       ccl_private ShaderData *sd)
+                                                       ccl_private ShaderData *sd,
+                                                       const uint32_t path_flag)
 {
+  /* Filter out closures. */
+  if (kernel_data.integrator.filter_closures) {
+    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
+      sd->closure_emission_background = zero_float3();
+    }
+
+    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
+      sd->flag &= ~SD_BSDF_HAS_EVAL;
+    }
+
+    if (path_flag & PATH_RAY_CAMERA) {
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+
+        if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
+          if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE) {
+            sc->type = CLOSURE_NONE_ID;
+            sc->sample_weight = 0.0f;
+          }
+        }
+        else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) {
+          if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY) {
+            sc->type = CLOSURE_NONE_ID;
+            sc->sample_weight = 0.0f;
+          }
+        }
+        else if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) {
+          if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION) {
+            sc->type = CLOSURE_NONE_ID;
+            sc->sample_weight = 0.0f;
+          }
+        }
+      }
+    }
+  }
+
  /* Defensive sampling.
   *
   * We can likely also do defensive sampling at deeper bounces, particularly
@@ -209,8 +246,7 @@ ccl_device_inline float _shader_bsdf_multi_eval(KernelGlobals kg,
        float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);

        if (bsdf_pdf != 0.0f) {
-          const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
-          bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
+          bsdf_eval_accum(result_eval, sc->type, eval * sc->weight);
          sum_pdf += bsdf_pdf * sc->sample_weight;
        }
      }
@@ -235,7 +271,7 @@ ccl_device_inline
                     ccl_private BsdfEval *bsdf_eval,
                     const uint light_shader_flags)
 {
-  bsdf_eval_init(bsdf_eval, false, zero_float3());
+  bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_float3());

  return _shader_bsdf_multi_eval(
      kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
@@ -328,8 +364,7 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals kg,
  label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);

  if (*pdf != 0.0f) {
-    const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
-    bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
+    bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);

    if (sd->num_closure > 1) {
      const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
@@ -655,7 +690,7 @@ ccl_device_inline float _shader_volume_phase_multi_eval(
    float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);

    if (phase_pdf != 0.0f) {
-      bsdf_eval_accum(result_eval, false, eval, 1.0f);
+      bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
      sum_pdf += phase_pdf * svc->sample_weight;
    }

@@ -671,7 +706,7 @@ ccl_device float shader_volume_phase_eval(KernelGlobals kg,
                                          const float3 omega_in,
                                          ccl_private BsdfEval *phase_eval)
 {
-  bsdf_eval_init(phase_eval, false, zero_float3());
+  bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_float3());

  return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
 }
@@ -729,7 +764,7 @@ ccl_device int shader_volume_phase_sample(KernelGlobals kg,
  label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);

  if (*pdf != 0.0f) {
-    bsdf_eval_init(phase_eval, false, eval);
+    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
  }

  return label;
@@ -752,7 +787,7 @@ ccl_device int shader_phase_sample_closure(KernelGlobals kg,
  label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);

  if (*pdf != 0.0f)
-    bsdf_eval_init(phase_eval, false, eval);
+    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);

  return label;
 }
--- a/intern/cycles/kernel/integrator/shadow_catcher.h
+++ b/intern/cycles/kernel/integrator/shadow_catcher.h
@@ -16,6 +16,7 @@

 #pragma once

+#include "kernel/film/write_passes.h"
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/state_util.h"

@@ -47,7 +48,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
    return false;
  }

-  if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+  if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
    return false;
  }

@@ -76,33 +77,6 @@ ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
  return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
 }

-/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
- * after this function. */
-ccl_device_inline bool kernel_shadow_catcher_split(KernelGlobals kg,
-                                                   IntegratorState state,
-                                                   const int object_flags)
-{
-#ifdef __SHADOW_CATCHER__
-
-  if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
-    return false;
-  }
-
-  /* The split is to be done. Mark the current state as such, so that it stops contributing to the
-   * shadow catcher matte pass, but keeps contributing to the combined pass. */
-  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
-
-  /* Split new state from the current one. This new state will only track contribution of shadow
-   * catcher objects ignoring non-catcher objects. */
-  integrator_state_shadow_catcher_split(kg, state);
-
-  return true;
-#else
-  (void)object_flags;
-  return false;
-#endif
-}
-
 #ifdef __SHADOW_CATCHER__

 ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(const uint32_t path_flag)
@@ -115,6 +89,28 @@ ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(const uint32_t
  return path_flag & PATH_RAY_SHADOW_CATCHER_PASS;
 }

+/* Write shadow catcher passes on a bounce from the shadow catcher object. */
+ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
+    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
+{
+  kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
+  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+                                        kernel_data.film.pass_stride;
+  ccl_global float *buffer = render_buffer + render_buffer_offset;
+
+  /* Count sample for the shadow catcher object. */
+  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+  /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
+   * transparency to the matte. */
+  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+                          average(throughput));
+}
+
 #endif /* __SHADOW_CATCHER__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -46,8 +46,9 @@ KERNEL_STRUCT_MEMBER(shadow_path,
                     float3,
                     unshadowed_throughput,
                     KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
-/* Ratio of throughput to distinguish diffuse and glossy render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_END(shadow_path)
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -173,10 +173,10 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;

 /* Array access on GPU with Structure-of-Arrays. */

-typedef const int IntegratorState;
-typedef const int ConstIntegratorState;
-typedef const int IntegratorShadowState;
-typedef const int ConstIntegratorShadowState;
+typedef int IntegratorState;
+typedef int ConstIntegratorState;
+typedef int IntegratorShadowState;
+typedef int ConstIntegratorShadowState;

 #  define INTEGRATOR_STATE_NULL -1

--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -60,8 +60,9 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
 KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
-/* Ratio of throughput to distinguish diffuse and glossy render passes. */
-KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
+KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Denoising. */
 KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
 /* Shader sorting. */
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -326,8 +326,8 @@ ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,

 /* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
 * after this function. */
-ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
-                                                             IntegratorState state)
+ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGlobals kg,
+                                                                        IntegratorState state)
 {
 #if defined(__KERNEL_GPU__)
  ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
@@ -337,14 +337,14 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
 #else
  IntegratorStateCPU *ccl_restrict to_state = state + 1;

-  /* Only copy the required subset, since shadow intersections are big and irrelevant here. */
+  /* Only copy the required subset for performance. */
  to_state->path = state->path;
  to_state->ray = state->ray;
  to_state->isect = state->isect;
  integrator_state_copy_volume_stack(kg, to_state, state);
 #endif

-  INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
+  return to_state;
 }

 #ifdef __KERNEL_CPU__
--- a/intern/cycles/kernel/integrator/subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -79,7 +79,8 @@ ccl_device int subsurface_bounce(KernelGlobals kg,

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
    if (INTEGRATOR_STATE(state, path, bounce) == 0) {
-      INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = one_float3();
+      INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
+      INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
    }
  }

--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -353,8 +353,8 @@ ccl_device bool light_sample_from_distant_ray(KernelGlobals kg,
  /* compute pdf */
  float invarea = klight->distant.invarea;
  ls->pdf = invarea / (costheta * costheta * costheta);
-  ls->pdf *= kernel_data.integrator.pdf_lights;
  ls->eval_fac = ls->pdf;
+  ls->pdf *= kernel_data.integrator.pdf_lights;

  return true;
 }
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -832,16 +832,21 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
 {
  if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
      attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) {
-    return set_attribute_float3(*(float3 *)attr.value.data(), type, derivatives, val);
+    const float *data = (const float *)attr.value.data();
+    return set_attribute_float3(make_float3(data[0], data[1], data[2]), type, derivatives, val);
  }
  else if (attr.type == TypeFloat2) {
-    return set_attribute_float2(*(float2 *)attr.value.data(), type, derivatives, val);
+    const float *data = (const float *)attr.value.data();
+    return set_attribute_float2(make_float2(data[0], data[1]), type, derivatives, val);
  }
  else if (attr.type == TypeDesc::TypeFloat) {
-    return set_attribute_float(*(float *)attr.value.data(), type, derivatives, val);
+    const float *data = (const float *)attr.value.data();
+    return set_attribute_float(data[0], type, derivatives, val);
  }
  else if (attr.type == TypeRGBA || attr.type == TypeDesc::TypeFloat4) {
-    return set_attribute_float4(*(float4 *)attr.value.data(), type, derivatives, val);
+    const float *data = (const float *)attr.value.data();
+    return set_attribute_float4(
+        make_float4(data[0], data[1], data[2], data[3]), type, derivatives, val);
  }
  else if (attr.type == type) {
    size_t datasize = attr.value.datasize();
--- a/intern/cycles/kernel/osl/shader.cpp
+++ b/intern/cycles/kernel/osl/shader.cpp
@@ -132,10 +132,12 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
  /* Used by render-services. */
  sd->osl_globals = kg;
  if (path_flag & PATH_RAY_SHADOW) {
+    sd->osl_path_state = nullptr;
    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
  }
  else {
    sd->osl_path_state = (const IntegratorStateCPU *)state;
+    sd->osl_shadow_path_state = nullptr;
  }
 }

--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -286,27 +286,26 @@ enum PathRayFlag {
  PATH_RAY_DENOISING_FEATURES = (1U << 23U),

  /* Render pass categories. */
-  PATH_RAY_REFLECT_PASS = (1U << 24U),
-  PATH_RAY_TRANSMISSION_PASS = (1U << 25U),
-  PATH_RAY_VOLUME_PASS = (1U << 26U),
-  PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
+  PATH_RAY_SURFACE_PASS = (1U << 24U),
+  PATH_RAY_VOLUME_PASS = (1U << 25U),
+  PATH_RAY_ANY_PASS = (PATH_RAY_SURFACE_PASS | PATH_RAY_VOLUME_PASS),

  /* Shadow ray is for a light or surface, or AO. */
-  PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
-  PATH_RAY_SHADOW_FOR_AO = (1U << 28U),
+  PATH_RAY_SHADOW_FOR_LIGHT = (1U << 26U),
+  PATH_RAY_SHADOW_FOR_AO = (1U << 27U),

  /* A shadow catcher object was hit and the path was split into two. */
-  PATH_RAY_SHADOW_CATCHER_HIT = (1U << 29U),
+  PATH_RAY_SHADOW_CATCHER_HIT = (1U << 28U),

  /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
   * their dedicated pass for later division.
   *
   * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
   * which is separate from the light passes. */
-  PATH_RAY_SHADOW_CATCHER_PASS = (1U << 30U),
+  PATH_RAY_SHADOW_CATCHER_PASS = (1U << 29U),

  /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
-  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 31U),
+  PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1U << 30U),
 };

 /* Configure ray visibility bits for rays and objects respectively,
@@ -428,8 +427,19 @@ typedef enum CryptomatteType {
 typedef struct BsdfEval {
  float3 diffuse;
  float3 glossy;
+  float3 sum;
 } BsdfEval;

+/* Closure Filter */
+
+typedef enum FilterClosures {
+  FILTER_CLOSURE_EMISSION = (1 << 0),
+  FILTER_CLOSURE_DIFFUSE = (1 << 1),
+  FILTER_CLOSURE_GLOSSY = (1 << 2),
+  FILTER_CLOSURE_TRANSMISSION = (1 << 3),
+  FILTER_CLOSURE_DIRECT_LIGHT = (1 << 4),
+} FilterClosures;
+
 /* Shader Flag */

 typedef enum ShaderFlag {
@@ -1186,7 +1196,11 @@ typedef struct KernelIntegrator {
  int has_shadow_catcher;
  float scrambling_distance;

+  /* Closure filter. */
+  int filter_closures;
+
  /* padding */
+  int pad1, pad2, pad3;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);

@@ -1410,6 +1424,7 @@ typedef struct KernelWorkTile {

  uint start_sample;
  uint num_samples;
+  uint sample_offset;

  int offset;
  uint stride;
--- a/intern/cycles/scene/film.cpp
+++ b/intern/cycles/scene/film.cpp
@@ -187,8 +187,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
  kfilm->pass_transmission_indirect = PASS_UNUSED;
  kfilm->pass_volume_direct = PASS_UNUSED;
  kfilm->pass_volume_indirect = PASS_UNUSED;
-  kfilm->pass_volume_direct = PASS_UNUSED;
-  kfilm->pass_volume_indirect = PASS_UNUSED;
  kfilm->pass_shadow = PASS_UNUSED;

  /* Mark passes as unused so that the kernel knows the pass is inaccessible. */
@@ -673,13 +671,12 @@ uint Film::get_kernel_features(const Scene *scene) const
      kernel_features |= KERNEL_FEATURE_DENOISING;
    }

-    if (pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
-        pass_type <= PASS_CATEGORY_LIGHT_END) {
+    if (pass_type >= PASS_DIFFUSE && pass_type <= PASS_VOLUME_INDIRECT) {
      kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
+    }

-      if (pass_type == PASS_SHADOW) {
-        kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
-      }
+    if (pass_type == PASS_SHADOW) {
+      kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
    }

    if (pass_type == PASS_AO) {
--- a/intern/cycles/scene/image.cpp
+++ b/intern/cycles/scene/image.cpp
@@ -303,7 +303,6 @@ ImageManager::ImageManager(const DeviceInfo &info)
  animation_frame = 0;

  /* Set image limits */
-  features.has_half_float = info.has_half_images;
  features.has_nanovdb = info.has_nanovdb;
 }

@@ -357,8 +356,6 @@ void ImageManager::load_image_metadata(Image *img)

  metadata.detect_colorspace();

-  assert(features.has_half_float ||
-         (metadata.type != IMAGE_DATA_TYPE_HALF4 && metadata.type != IMAGE_DATA_TYPE_HALF));
  assert(features.has_nanovdb || (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
                                  metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3));

--- a/intern/cycles/scene/image.h
+++ b/intern/cycles/scene/image.h
@@ -100,7 +100,6 @@ class ImageMetaData {
 /* Information about supported features that Image loaders can use. */
 class ImageDeviceFeatures {
 public:
-  bool has_half_float;
  bool has_nanovdb;
 };

--- a/intern/cycles/scene/image_oiio.cpp
+++ b/intern/cycles/scene/image_oiio.cpp
@@ -30,7 +30,8 @@ OIIOImageLoader::~OIIOImageLoader()
 {
 }

-bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMetaData &metadata)
+bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures & /*features*/,
+                                    ImageMetaData &metadata)
 {
  /* Perform preliminary checks, with meaningful logging. */
  if (!path_exists(filepath.string())) {
@@ -76,7 +77,7 @@ bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMe
  }

  /* check if it's half float */
-  if (spec.format == TypeDesc::HALF && features.has_half_float) {
+  if (spec.format == TypeDesc::HALF) {
    is_half = true;
  }

--- a/intern/cycles/scene/integrator.cpp
+++ b/intern/cycles/scene/integrator.cpp
@@ -63,6 +63,14 @@ NODE_DEFINE(Integrator)
  SOCKET_BOOLEAN(caustics_reflective, "Reflective Caustics", true);
  SOCKET_BOOLEAN(caustics_refractive, "Refractive Caustics", true);
  SOCKET_FLOAT(filter_glossy, "Filter Glossy", 0.0f);
+
+  SOCKET_BOOLEAN(use_direct_light, "Use Direct Light", true);
+  SOCKET_BOOLEAN(use_indirect_light, "Use Indirect Light", true);
+  SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true);
+  SOCKET_BOOLEAN(use_glossy, "Use Glossy", true);
+  SOCKET_BOOLEAN(use_transmission, "Use Transmission", true);
+  SOCKET_BOOLEAN(use_emission, "Use Emission", true);
+
  SOCKET_INT(seed, "Seed", 0);
  SOCKET_FLOAT(sample_clamp_direct, "Sample Clamp Direct", 0.0f);
  SOCKET_FLOAT(sample_clamp_indirect, "Sample Clamp Indirect", 0.0f);
@@ -184,6 +192,27 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
  kintegrator->caustics_refractive = caustics_refractive;
  kintegrator->filter_glossy = (filter_glossy == 0.0f) ? FLT_MAX : 1.0f / filter_glossy;

+  kintegrator->filter_closures = 0;
+  if (!use_direct_light) {
+    kintegrator->filter_closures |= FILTER_CLOSURE_DIRECT_LIGHT;
+  }
+  if (!use_indirect_light) {
+    kintegrator->min_bounce = 1;
+    kintegrator->max_bounce = 1;
+  }
+  if (!use_diffuse) {
+    kintegrator->filter_closures |= FILTER_CLOSURE_DIFFUSE;
+  }
+  if (!use_glossy) {
+    kintegrator->filter_closures |= FILTER_CLOSURE_GLOSSY;
+  }
+  if (!use_transmission) {
+    kintegrator->filter_closures |= FILTER_CLOSURE_TRANSMISSION;
+  }
+  if (!use_emission) {
+    kintegrator->filter_closures |= FILTER_CLOSURE_EMISSION;
+  }
+
  kintegrator->seed = seed;

  kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX :
--- a/intern/cycles/scene/integrator.h
+++ b/intern/cycles/scene/integrator.h
@@ -56,6 +56,13 @@ class Integrator : public Node {
  NODE_SOCKET_API(bool, caustics_refractive)
  NODE_SOCKET_API(float, filter_glossy)

+  NODE_SOCKET_API(bool, use_direct_light);
+  NODE_SOCKET_API(bool, use_indirect_light);
+  NODE_SOCKET_API(bool, use_diffuse);
+  NODE_SOCKET_API(bool, use_glossy);
+  NODE_SOCKET_API(bool, use_transmission);
+  NODE_SOCKET_API(bool, use_emission);
+
  NODE_SOCKET_API(int, seed)

  NODE_SOCKET_API(float, sample_clamp_direct)
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -274,19 +274,26 @@ void OSLShaderManager::shading_system_init()

        "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */

-        "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */
-        "__unused__", /* PATH_RAY_TRANSPARENT_BACKGROUND */
-        "__unused__", /* PATH_RAY_TERMINATE_IMMEDIATE */
-        "__unused__", /* PATH_RAY_TERMINATE_AFTER_TRANSPARENT */
-        "__unused__", /* PATH_RAY_EMISSION */
-        "__unused__", /* PATH_RAY_SUBSURFACE */
-        "__unused__", /* PATH_RAY_DENOISING_FEATURES */
-        "__unused__", /* PATH_RAY_REFLECT_PASS */
-        "__unused__", /* PATH_RAY_TRANSMISSION_PASS */
-        "__unused__", /* PATH_RAY_VOLUME_PASS */
-        "__unused__", /* PATH_RAY_SHADOW_FOR_LIGHT */
-        "__unused__", /* PATH_RAY_SHADOW_CATCHER_HIT */
-        "__unused__", /* PATH_RAY_SHADOW_CATCHER_PASS */
+        /* Remaining irrelevant bits up to 32. */
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
+        "__unused__",
    };

    const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -262,6 +262,7 @@ RenderWork Session::run_update_for_next_iteration()
  }

  render_scheduler_.set_num_samples(params.samples);
+  render_scheduler_.set_start_sample(params.sample_offset);
  render_scheduler_.set_time_limit(params.time_limit);

  while (have_tiles) {
@@ -397,7 +398,7 @@ void Session::do_delayed_reset()

  /* Tile and work scheduling. */
  tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size());
-  render_scheduler_.reset(buffer_params_, params.samples);
+  render_scheduler_.reset(buffer_params_, params.samples, params.sample_offset);

  /* Passes. */
  /* When multiple tiles are used SAMPLE_COUNT pass is used to keep track of possible partial
--- a/intern/cycles/session/session.h
+++ b/intern/cycles/session/session.h
@@ -54,6 +54,7 @@ class SessionParams {

  bool experimental;
  int samples;
+  int sample_offset;
  int pixel_size;
  int threads;

@@ -75,6 +76,7 @@ class SessionParams {

    experimental = false;
    samples = 1024;
+    sample_offset = 0;
    pixel_size = 1;
    threads = 0;
    time_limit = 0.0;
--- a/intern/cycles/session/tile.cpp
+++ b/intern/cycles/session/tile.cpp
@@ -29,6 +29,7 @@
 #include "util/path.h"
 #include "util/string.h"
 #include "util/system.h"
+#include "util/time.h"
 #include "util/types.h"

 CCL_NAMESPACE_BEGIN
@@ -503,9 +504,9 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers)
    }
  }

-  DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride);
+  const double time_start = time_dt();

-  vector<float> pixel_storage;
+  DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride);

  const BufferParams &tile_params = tile_buffers.params;

@@ -515,13 +516,32 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers)
  const int64_t pass_stride = tile_params.pass_stride;
  const int64_t tile_row_stride = tile_params.width * pass_stride;

-  const int64_t xstride = pass_stride * sizeof(float);
-  const int64_t ystride = xstride * tile_params.width;
-  const int64_t zstride = ystride * tile_params.height;
-
+  vector<float> pixel_storage;
  const float *pixels = tile_buffers.buffer.data() + tile_params.window_x * pass_stride +
                        tile_params.window_y * tile_row_stride;

+  /* If there is an overscan used for the tile copy pixels into single continuous block of memory
+   * without any "gaps".
+   * This is a workaround for bug in OIIO (https://github.com/OpenImageIO/oiio/pull/3176).
+   * Our task reference: T93008. */
+  if (tile_params.window_x || tile_params.window_y ||
+      tile_params.window_width != tile_params.width ||
+      tile_params.window_height != tile_params.height) {
+    pixel_storage.resize(pass_stride * tile_params.window_width * tile_params.window_height);
+    float *pixels_continuous = pixel_storage.data();
+
+    const int64_t pixels_row_stride = pass_stride * tile_params.width;
+    const int64_t pixels_continuous_row_stride = pass_stride * tile_params.window_width;
+
+    for (int i = 0; i < tile_params.window_height; ++i) {
+      memcpy(pixels_continuous, pixels, sizeof(float) * pixels_continuous_row_stride);
+      pixels += pixels_row_stride;
+      pixels_continuous += pixels_continuous_row_stride;
+    }
+
+    pixels = pixel_storage.data();
+  }
+
  VLOG(3) << "Write tile at " << tile_x << ", " << tile_y;

  /* The image tile sizes in the OpenEXR file are different from the size of our big tiles. The
@@ -531,6 +551,11 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers)
   *
   * The only thing we have to ensure is that the tile_x and tile_y are a multiple of the
   * image tile size, which happens in compute_render_tile_size. */
+
+  const int64_t xstride = pass_stride * sizeof(float);
+  const int64_t ystride = xstride * tile_params.window_width;
+  const int64_t zstride = ystride * tile_params.window_height;
+
  if (!write_state_.tile_out->write_tiles(tile_x,
                                          tile_x + tile_params.window_width,
                                          tile_y,
@@ -548,6 +573,8 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers)

  ++write_state_.num_tiles_written;

+  VLOG(3) << "Tile written in " << time_dt() - time_start << " seconds.";
+
  return true;
 }

@@ -589,6 +616,9 @@ void TileManager::finish_write_tiles()
    full_buffer_written_cb(write_state_.filename);
  }

+  VLOG(3) << "Tile file size is "
+          << string_human_readable_number(path_file_size(write_state_.filename)) << " bytes.";
+
  /* Advance the counter upon explicit finish of the file.
   * Makes it possible to re-use tile manager for another scene, and avoids unnecessary increments
   * of the tile-file-within-session index. */
--- a/intern/cycles/util/profiling.cpp
+++ b/intern/cycles/util/profiling.cpp
@@ -171,4 +171,9 @@ bool Profiler::get_object(int object, uint64_t &samples, uint64_t &hits)
  return true;
 }

+bool Profiler::active() const
+{
+  return (worker != nullptr);
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/util/profiling.h
+++ b/intern/cycles/util/profiling.h
@@ -96,6 +96,8 @@ class Profiler {
  bool get_shader(int shader, uint64_t &samples, uint64_t &hits);
  bool get_object(int object, uint64_t &samples, uint64_t &hits);

+  bool active() const;
+
 protected:
  void run();

--- a/intern/ghost/CMakeLists.txt
+++ b/intern/ghost/CMakeLists.txt
@@ -489,6 +489,9 @@ if(WITH_XR_OPENXR)
    intern/GHOST_XrSwapchain.h
    intern/GHOST_Xr_intern.h
    intern/GHOST_Xr_openxr_includes.h
+
+    # Header only library.
+    ../../extern/tinygltf/tiny_gltf.h
  )
  list(APPEND INC
    ../../extern/json/include
--- a/Show More
+++ b/Show More