Compare commits


4 Commits

SHA1 Message Date
6feb56e6da add cmake changes 2021-11-05 14:48:22 +01:00
f64859da9a fix 2021-11-05 14:46:42 +01:00
f8d2e14709 cleanup 2021-11-05 14:46:32 +01:00
371abaf66c prepare for unity builds 2021-11-05 13:11:26 +01:00
589 changed files with 4136 additions and 8793 deletions

View File

@@ -440,11 +440,7 @@ mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
mark_as_advanced(WITH_CUDA_DYNLOAD)
# AMD HIP
if(WIN32)
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" ON)
else()
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" OFF)
endif()
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" OFF)
option(WITH_CYCLES_HIP_BINARIES "Build Cycles AMD HIP binaries" OFF)
set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 CACHE STRING "AMD HIP architectures to build binaries for")
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
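The hunk above collapses a platform-dependent default (HIP on for Windows, off elsewhere) into a single OFF default. In C++ code such an option typically surfaces as a preprocessor definition; the sketch below shows the usual guard pattern, assuming a WITH_CYCLES_DEVICE_HIP define is propagated by the build (an assumption — the mapping is not shown in this diff, and hip_device_available() is a hypothetical helper, not Cycles' actual API).

/* Minimal sketch, assuming the CMake option is forwarded as a
 * compile definition. */
#ifdef WITH_CYCLES_DEVICE_HIP
static bool hip_device_available() { return true; /* would probe the HIP runtime here */ }
#else
static bool hip_device_available() { return false; /* HIP support compiled out */ }
#endif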
@@ -494,8 +490,7 @@ endif()
# This should be turned off when Blender enters beta/rc/release
if("${BLENDER_VERSION_CYCLE}" STREQUAL "release" OR
"${BLENDER_VERSION_CYCLE}" STREQUAL "rc" OR
"${BLENDER_VERSION_CYCLE}" STREQUAL "beta")
"${BLENDER_VERSION_CYCLE}" STREQUAL "rc")
set(WITH_EXPERIMENTAL_FEATURES OFF)
else()
set(WITH_EXPERIMENTAL_FEATURES ON)

View File

@@ -42,7 +42,6 @@ ExternalProject_Add(nanovdb
URL_HASH ${NANOVDB_HASH_TYPE}=${NANOVDB_HASH}
PREFIX ${BUILD_DIR}/nanovdb
SOURCE_SUBDIR nanovdb
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/nanovdb/src/nanovdb < ${PATCH_DIR}/nanovdb.diff
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/nanovdb ${DEFAULT_CMAKE_FLAGS} ${NANOVDB_EXTRA_ARGS}
INSTALL_DIR ${LIBDIR}/nanovdb
)

View File

@@ -1,374 +0,0 @@
Index: nanovdb/nanovdb/NanoVDB.h
===================================================================
--- a/nanovdb/nanovdb/NanoVDB.h (revision 62751)
+++ b/nanovdb/nanovdb/NanoVDB.h (working copy)
@@ -152,8 +152,8 @@
#endif // __CUDACC_RTC__
-#ifdef __CUDACC__
-// Only define __hostdev__ when using NVIDIA CUDA compiler
+#if defined(__CUDACC__) || defined(__HIP__)
+// Only define __hostdev__ when using NVIDIA CUDA or HIP compiler
#define __hostdev__ __host__ __device__
#else
#define __hostdev__
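This hunk, from the nanovdb.diff patch shown in this file, is the key enabler for the rest of the patch: HIP-Clang defines __HIP__ rather than __CUDACC__, so the guard must test both for NanoVDB headers to mark functions as host+device under either compiler. A minimal sketch of how the macro is then used (lerp() here is illustrative, not from NanoVDB):

#if defined(__CUDACC__) || defined(__HIP__)
#define __hostdev__ __host__ __device__
#else
#define __hostdev__
#endif

// Compiles for both host and device under NVCC or HIP-Clang, and as a
// plain inline function under a host-only compiler.
__hostdev__ inline float lerp(float a, float b, float w) { return a + w * (b - a); }

The many hunks below that prefix out-of-class definitions with __hostdev__ are needed because, under NVCC and HIP-Clang, the execution-space annotation on a declaration must also appear on the separate definition.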
@@ -461,7 +461,7 @@
/// Maximum floating-point values
template<typename T>
struct Maximum;
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP__)
template<>
struct Maximum<int>
{
@@ -1006,10 +1006,10 @@
using Vec3i = Vec3<int>;
/// @brief Return a single precision floating-point vector of this coordinate
-Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); }
+inline __hostdev__ Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); }
/// @brief Return a double precision floating-point vector of this coordinate
-Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); }
+inline __hostdev__ Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); }
// ----------------------------> Vec4 <--------------------------------------
@@ -1820,7 +1820,7 @@
}; // Map
template<typename Mat4T>
-void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper)
+__hostdev__ void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper)
{
float * mf = mMatF, *vf = mVecF;
float* mif = mInvMatF;
@@ -2170,7 +2170,7 @@
}; // Class Grid
template<typename TreeT>
-int Grid<TreeT>::findBlindDataForSemantic(GridBlindDataSemantic semantic) const
+__hostdev__ int Grid<TreeT>::findBlindDataForSemantic(GridBlindDataSemantic semantic) const
{
for (uint32_t i = 0, n = blindDataCount(); i < n; ++i)
if (blindMetaData(i).mSemantic == semantic)
@@ -2328,7 +2328,7 @@
}; // Tree class
template<typename RootT>
-void Tree<RootT>::extrema(ValueType& min, ValueType& max) const
+__hostdev__ void Tree<RootT>::extrema(ValueType& min, ValueType& max) const
{
min = this->root().valueMin();
max = this->root().valueMax();
@@ -2336,7 +2336,7 @@
template<typename RootT>
template<typename NodeT>
-const NodeT* Tree<RootT>::getNode(uint32_t i) const
+__hostdev__ const NodeT* Tree<RootT>::getNode(uint32_t i) const
{
static_assert(is_same<TreeNodeT<NodeT::LEVEL>, NodeT>::value, "Tree::getNode: invalid node type");
NANOVDB_ASSERT(i < DataType::mCount[NodeT::LEVEL]);
@@ -2345,7 +2345,7 @@
template<typename RootT>
template<int LEVEL>
-const typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i) const
+__hostdev__ const typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i) const
{
NANOVDB_ASSERT(i < DataType::mCount[LEVEL]);
return reinterpret_cast<const TreeNodeT<LEVEL>*>(reinterpret_cast<const uint8_t*>(this) + DataType::mBytes[LEVEL]) + i;
@@ -2353,7 +2353,7 @@
template<typename RootT>
template<typename NodeT>
-NodeT* Tree<RootT>::getNode(uint32_t i)
+__hostdev__ NodeT* Tree<RootT>::getNode(uint32_t i)
{
static_assert(is_same<TreeNodeT<NodeT::LEVEL>, NodeT>::value, "Tree::getNode: invalid node type");
NANOVDB_ASSERT(i < DataType::mCount[NodeT::LEVEL]);
@@ -2362,7 +2362,7 @@
template<typename RootT>
template<int LEVEL>
-typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i)
+__hostdev__ typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i)
{
NANOVDB_ASSERT(i < DataType::mCount[LEVEL]);
return reinterpret_cast<TreeNodeT<LEVEL>*>(reinterpret_cast<uint8_t*>(this) + DataType::mBytes[LEVEL]) + i;
@@ -2370,7 +2370,7 @@
template<typename RootT>
template<typename NodeT>
-uint32_t Tree<RootT>::getNodeID(const NodeT& node) const
+__hostdev__ uint32_t Tree<RootT>::getNodeID(const NodeT& node) const
{
static_assert(is_same<TreeNodeT<NodeT::LEVEL>, NodeT>::value, "Tree::getNodeID: invalid node type");
const NodeT* first = reinterpret_cast<const NodeT*>(reinterpret_cast<const uint8_t*>(this) + DataType::mBytes[NodeT::LEVEL]);
@@ -2380,7 +2380,7 @@
template<typename RootT>
template<typename NodeT>
-uint32_t Tree<RootT>::getLinearOffset(const NodeT& node) const
+__hostdev__ uint32_t Tree<RootT>::getLinearOffset(const NodeT& node) const
{
return this->getNodeID(node) + DataType::mPFSum[NodeT::LEVEL];
}
@@ -3366,7 +3366,7 @@
}; // LeafNode class
template<typename ValueT, typename CoordT, template<uint32_t> class MaskT, uint32_t LOG2DIM>
-inline void LeafNode<ValueT, CoordT, MaskT, LOG2DIM>::updateBBox()
+inline __hostdev__ void LeafNode<ValueT, CoordT, MaskT, LOG2DIM>::updateBBox()
{
static_assert(LOG2DIM == 3, "LeafNode::updateBBox: only supports LOG2DIM = 3!");
if (!this->isActive()) return;
Index: nanovdb/nanovdb/util/SampleFromVoxels.h
===================================================================
--- a/nanovdb/nanovdb/util/SampleFromVoxels.h (revision 62751)
+++ b/nanovdb/nanovdb/util/SampleFromVoxels.h (working copy)
@@ -22,7 +22,7 @@
#define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED
// Only define __hostdev__ when compiling as NVIDIA CUDA
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIP__)
#define __hostdev__ __host__ __device__
#else
#include <cmath> // for floor
@@ -136,7 +136,7 @@
template<typename TreeOrAccT>
template<typename Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const Vec3T& xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const Vec3T& xyz) const
{
const CoordT ijk = Round<CoordT>(xyz);
if (ijk != mPos) {
@@ -147,7 +147,7 @@
}
template<typename TreeOrAccT>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const CoordT& ijk) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const CoordT& ijk) const
{
if (ijk != mPos) {
mPos = ijk;
@@ -158,7 +158,7 @@
template<typename TreeOrAccT>
template<typename Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, false>::operator()(const Vec3T& xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, false>::operator()(const Vec3T& xyz) const
{
return mAcc.getValue(Round<CoordT>(xyz));
}
@@ -195,7 +195,7 @@
}; // TrilinearSamplerBase
template<typename TreeOrAccT>
-void TrilinearSampler<TreeOrAccT>::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const
+__hostdev__ void TrilinearSampler<TreeOrAccT>::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const
{
v[0][0][0] = mAcc.getValue(ijk); // i, j, k
@@ -224,7 +224,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType TrilinearSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
+__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
{
#if 0
auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a
@@ -239,7 +239,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-Vec3T<typename TreeOrAccT::ValueType> TrilinearSampler<TreeOrAccT>::gradient(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
+__hostdev__ Vec3T<typename TreeOrAccT::ValueType> TrilinearSampler<TreeOrAccT>::gradient(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
{
static_assert(std::is_floating_point<ValueT>::value, "TrilinearSampler::gradient requires a floating-point type");
#if 0
@@ -270,7 +270,7 @@
}
template<typename TreeOrAccT>
-bool TrilinearSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[2][2][2])
+__hostdev__ bool TrilinearSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[2][2][2])
{
static_assert(std::is_floating_point<ValueT>::value, "TrilinearSampler::zeroCrossing requires a floating-point type");
const bool less = v[0][0][0] < ValueT(0);
@@ -363,7 +363,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(Vec3T<RealT> xyz) const
{
this->cache(xyz);
return BaseT::sample(xyz, mVal);
@@ -370,7 +370,7 @@
}
template<typename TreeOrAccT>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(const CoordT &ijk) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(const CoordT &ijk) const
{
return ijk == mPos ? mVal[0][0][0] : BaseT::mAcc.getValue(ijk);
}
@@ -377,7 +377,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, true>::gradient(Vec3T<RealT> xyz) const
+__hostdev__ Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, true>::gradient(Vec3T<RealT> xyz) const
{
this->cache(xyz);
return BaseT::gradient(xyz, mVal);
@@ -393,7 +393,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-void SampleFromVoxels<TreeOrAccT, 1, true>::cache(Vec3T<RealT>& xyz) const
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 1, true>::cache(Vec3T<RealT>& xyz) const
{
CoordT ijk = Floor<CoordT>(xyz);
if (ijk != mPos) {
@@ -406,7 +406,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
{
ValueT val[2][2][2];
CoordT ijk = Floor<CoordT>(xyz);
@@ -418,7 +418,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
{
auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); };
@@ -463,7 +463,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-inline Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, false>::gradient(Vec3T<RealT> xyz) const
+inline __hostdev__ Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, false>::gradient(Vec3T<RealT> xyz) const
{
ValueT val[2][2][2];
CoordT ijk = Floor<CoordT>(xyz);
@@ -473,7 +473,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-bool SampleFromVoxels<TreeOrAccT, 1, false>::zeroCrossing(Vec3T<RealT> xyz) const
+__hostdev__ bool SampleFromVoxels<TreeOrAccT, 1, false>::zeroCrossing(Vec3T<RealT> xyz) const
{
ValueT val[2][2][2];
CoordT ijk = Floor<CoordT>(xyz);
@@ -510,7 +510,7 @@
}; // TriquadraticSamplerBase
template<typename TreeOrAccT>
-void TriquadraticSampler<TreeOrAccT>::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
+__hostdev__ void TriquadraticSampler<TreeOrAccT>::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
{
CoordT p(ijk[0] - 1, 0, 0);
for (int dx = 0; dx < 3; ++dx, ++p[0]) {
@@ -526,7 +526,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType TriquadraticSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[3][3][3])
+__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[3][3][3])
{
auto kernel = [](const ValueT* value, double weight)->ValueT {
return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) +
@@ -545,7 +545,7 @@
}
template<typename TreeOrAccT>
-bool TriquadraticSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[3][3][3])
+__hostdev__ bool TriquadraticSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[3][3][3])
{
static_assert(std::is_floating_point<ValueT>::value, "TrilinearSampler::zeroCrossing requires a floating-point type");
const bool less = v[0][0][0] < ValueT(0);
@@ -624,7 +624,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(Vec3T<RealT> xyz) const
{
this->cache(xyz);
return BaseT::sample(xyz, mVal);
@@ -631,7 +631,7 @@
}
template<typename TreeOrAccT>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(const CoordT &ijk) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(const CoordT &ijk) const
{
return ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk);
}
@@ -646,7 +646,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-void SampleFromVoxels<TreeOrAccT, 2, true>::cache(Vec3T<RealT>& xyz) const
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 2, true>::cache(Vec3T<RealT>& xyz) const
{
CoordT ijk = Floor<CoordT>(xyz);
if (ijk != mPos) {
@@ -657,7 +657,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, false>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, false>::operator()(Vec3T<RealT> xyz) const
{
ValueT val[3][3][3];
CoordT ijk = Floor<CoordT>(xyz);
@@ -667,7 +667,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-bool SampleFromVoxels<TreeOrAccT, 2, false>::zeroCrossing(Vec3T<RealT> xyz) const
+__hostdev__ bool SampleFromVoxels<TreeOrAccT, 2, false>::zeroCrossing(Vec3T<RealT> xyz) const
{
ValueT val[3][3][3];
CoordT ijk = Floor<CoordT>(xyz);
@@ -710,7 +710,7 @@
}; // TricubicSampler
template<typename TreeOrAccT>
-void TricubicSampler<TreeOrAccT>::stencil(const CoordT& ijk, ValueT (&C)[64]) const
+__hostdev__ void TricubicSampler<TreeOrAccT>::stencil(const CoordT& ijk, ValueT (&C)[64]) const
{
auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; };
@@ -929,7 +929,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
{
this->cache(xyz);
return BaseT::sample(xyz, mC);
@@ -937,7 +937,7 @@
template<typename TreeOrAccT>
template<typename RealT, template<typename...> class Vec3T>
-void SampleFromVoxels<TreeOrAccT, 3, true>::cache(Vec3T<RealT>& xyz) const
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 3, true>::cache(Vec3T<RealT>& xyz) const
{
CoordT ijk = Floor<CoordT>(xyz);
if (ijk != mPos) {

View File

@@ -168,7 +168,7 @@ def function_parm_wash_tokens(parm):
# if tokens[-1].kind == To
# remove trailing char
if tokens[-1].kind == TokenKind.PUNCTUATION:
if tokens[-1].spelling in {",", ")", ";"}:
if tokens[-1].spelling in (",", ")", ";"):
tokens.pop()
# else:
# print(tokens[-1].spelling)
@@ -179,7 +179,7 @@ def function_parm_wash_tokens(parm):
t_spelling = t.spelling
ok = True
if t_kind == TokenKind.KEYWORD:
if t_spelling in {"const", "restrict", "volatile"}:
if t_spelling in ("const", "restrict", "volatile"):
ok = False
elif t_spelling.startswith("__"):
ok = False # __restrict

View File

@@ -81,5 +81,4 @@ if(NOT APPLE)
set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
set(WITH_CYCLES_HIP_BINARIES ON CACHE BOOL "" FORCE)
endif()

View File

@@ -42,13 +42,8 @@ class SimpleMouseOperator(bpy.types.Operator):
self.y = event.mouse_y
return self.execute(context)
# Only needed if you want to add into a dynamic menu
def menu_func(self, context):
self.layout.operator(SimpleMouseOperator.bl_idname, text="Simple Mouse Operator")
# Register and add to the view menu (required to also use F3 search "Simple Mouse Operator" for quick access)
bpy.utils.register_class(SimpleMouseOperator)
bpy.types.VIEW3D_MT_view.append(menu_func)
# Test call to the newly defined operator.
# Here we call the operator and invoke it, meaning that the settings are taken

View File

@@ -43,7 +43,7 @@ def menu_func(self, context):
self.layout.operator(ExportSomeData.bl_idname, text="Text Export Operator")
# Register and add to the file selector (required to also use F3 search "Text Export Operator" for quick access)
# Register and add to the file selector
bpy.utils.register_class(ExportSomeData)
bpy.types.TOPBAR_MT_file_export.append(menu_func)

View File

@@ -27,14 +27,8 @@ class DialogOperator(bpy.types.Operator):
wm = context.window_manager
return wm.invoke_props_dialog(self)
# Only needed if you want to add into a dynamic menu
def menu_func(self, context):
self.layout.operator(DialogOperator.bl_idname, text="Dialog Operator")
# Register and add to the object menu (required to also use F3 search "Dialog Operator" for quick access)
bpy.utils.register_class(DialogOperator)
bpy.types.VIEW3D_MT_object.append(menu_func)
# Test call.
bpy.ops.object.dialog_operator('INVOKE_DEFAULT')

View File

@@ -41,13 +41,8 @@ class CustomDrawOperator(bpy.types.Operator):
col.prop(self, "my_string")
# Only needed if you want to add into a dynamic menu
def menu_func(self, context):
self.layout.operator(CustomDrawOperator.bl_idname, text="Custom Draw Operator")
# Register and add to the object menu (required to also use F3 search "Custom Draw Operator" for quick access)
bpy.utils.register_class(CustomDrawOperator)
bpy.types.VIEW3D_MT_object.append(menu_func)
# test call
bpy.ops.object.custom_draw('INVOKE_DEFAULT')

View File

@@ -55,13 +55,8 @@ class ModalOperator(bpy.types.Operator):
context.window_manager.modal_handler_add(self)
return {'RUNNING_MODAL'}
# Only needed if you want to add into a dynamic menu
def menu_func(self, context):
self.layout.operator(ModalOperator.bl_idname, text="Modal Operator")
# Register and add to the object menu (required to also use F3 search "Modal Operator" for quick access)
bpy.utils.register_class(ModalOperator)
bpy.types.VIEW3D_MT_object.append(menu_func)
# test call
bpy.ops.object.modal_operator('INVOKE_DEFAULT')

View File

@@ -31,13 +31,8 @@ class SearchEnumOperator(bpy.types.Operator):
context.window_manager.invoke_search_popup(self)
return {'RUNNING_MODAL'}
# Only needed if you want to add into a dynamic menu
def menu_func(self, context):
self.layout.operator(SearchEnumOperator.bl_idname, text="Search Enum Operator")
# Register and add to the object menu (required to also use F3 search "Search Enum Operator" for quick access)
bpy.utils.register_class(SearchEnumOperator)
bpy.types.VIEW3D_MT_object.append(menu_func)
# test call
bpy.ops.object.search_enum_operator('INVOKE_DEFAULT')

View File

@@ -22,13 +22,8 @@ class HelloWorldOperator(bpy.types.Operator):
print("Hello World")
return {'FINISHED'}
# Only needed if you want to add into a dynamic menu
def menu_func(self, context):
self.layout.operator(HelloWorldOperator.bl_idname, text="Hello World Operator")
# Register and add to the view menu (required to also use F3 search "Hello World Operator" for quick access)
bpy.utils.register_class(HelloWorldOperator)
bpy.types.VIEW3D_MT_view.append(menu_func)
# test call to the newly defined operator
bpy.ops.wm.hello_world()

View File

@@ -106,6 +106,24 @@ including advanced features.
floating-point values. These values are interpreted as a plane equation.
.. function:: glColor (red, green, blue, alpha):
B{glColor3b, glColor3d, glColor3f, glColor3i, glColor3s, glColor3ub, glColor3ui, glColor3us,
glColor4b, glColor4d, glColor4f, glColor4i, glColor4s, glColor4ub, glColor4ui, glColor4us,
glColor3bv, glColor3dv, glColor3fv, glColor3iv, glColor3sv, glColor3ubv, glColor3uiv,
glColor3usv, glColor4bv, glColor4dv, glColor4fv, glColor4iv, glColor4sv, glColor4ubv,
glColor4uiv, glColor4usv}
Set a new color.
.. seealso:: `OpenGL Docs <https://khronos.org/registry/OpenGL-Refpages/gl4/html/glColor.xhtml>`__
:type red, green, blue, alpha: Depends on function prototype.
:arg red, green, blue: Specify new red, green, and blue values for the current color.
:arg alpha: Specifies a new alpha value for the current color. Included only in the
four-argument glColor4 commands. (With '4' colors only)
.. function:: glColorMask(red, green, blue, alpha):
Enable and disable writing of frame buffer color components

View File

@@ -1123,7 +1123,7 @@ context_type_map = {
"soft_body": ("SoftBodyModifier", False),
"speaker": ("Speaker", False),
"texture": ("Texture", False),
"texture_slot": ("TextureSlot", False),
"texture_slot": ("MaterialTextureSlot", False),
"texture_user": ("ID", False),
"texture_user_property": ("Property", False),
"ui_list": ("UIList", False),

extern/hipew/README (vendored, 12 lines changed)
View File

@@ -1,12 +0,0 @@
The HIP Extension Wrangler Library (HIPEW) is a cross-platform open-source
C/C++ library to dynamically load the HIP library.
HIP (Heterogeneous-Compute Interface for Portability) is an API for C++
programming on AMD GPUs.
It is maintained as part of the Blender project, but included in extern/
for consistency with CUEW and CLEW libraries.
LICENSE
HIPEW is released under the Apache 2.0 license.

View File

@@ -1,5 +0,0 @@
Project: Blender
URL: https://git.blender.org/blender.git
License: Apache 2.0
Upstream version: N/A
Local modifications: None

View File

@@ -804,29 +804,31 @@ typedef enum hipDeviceP2PAttr {
} hipDeviceP2PAttr;
typedef struct HIP_MEMCPY3D {
unsigned int srcXInBytes;
unsigned int srcY;
unsigned int srcZ;
unsigned int srcLOD;
size_t srcXInBytes;
size_t srcY;
size_t srcZ;
size_t srcLOD;
hipMemoryType srcMemoryType;
const void* srcHost;
hipDeviceptr_t srcDevice;
hArray srcArray;
unsigned int srcPitch;
unsigned int srcHeight;
unsigned int dstXInBytes;
unsigned int dstY;
unsigned int dstZ;
unsigned int dstLOD;
hArray * srcArray;
void* reserved0;
size_t srcPitch;
size_t srcHeight;
size_t dstXInBytes;
size_t dstY;
size_t dstZ;
size_t dstLOD;
hipMemoryType dstMemoryType;
void* dstHost;
hipDeviceptr_t dstDevice;
hArray dstArray;
unsigned int dstPitch;
unsigned int dstHeight;
unsigned int WidthInBytes;
unsigned int Height;
unsigned int Depth;
hArray * dstArray;
void* reserved1;
size_t dstPitch;
size_t dstHeight;
size_t WidthInBytes;
size_t Height;
size_t Depth;
} HIP_MEMCPY3D;
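Widening the HIP_MEMCPY3D fields from unsigned int to size_t matters once a copy's pitch/extent arithmetic exceeds 32 bits. A worked example of the overflow the wider fields avoid (the numbers are illustrative):

#include <cstdio>

int main() {
  unsigned int pitch = 1u << 16, height = 1u << 12, depth = 1u << 6;
  unsigned int bytes32 = pitch * height * depth;    // wraps: 2^34 mod 2^32 == 0
  size_t bytes64 = (size_t)pitch * height * depth;  // 16 GiB, as intended
  std::printf("32-bit: %u bytes, 64-bit: %zu bytes\n", bytes32, bytes64);
}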
typedef struct HIP_MEMCPY3D_PEER_st {
@@ -877,7 +879,7 @@ typedef struct HIP_RESOURCE_DESC_st {
hipResourceType resType;
union {
struct {
hArray h_Array;
hArray * h_Array;
} array;
struct {
hipMipmappedArray_t hMipmappedArray;
@@ -1072,10 +1074,9 @@ typedef enum hiprtcResult {
typedef hipError_t HIPAPI thipGetErrorName(hipError_t error, const char** pStr);
typedef hipError_t HIPAPI thipInit(unsigned int Flags);
typedef hipError_t HIPAPI thipDriverGetVersion(int* driverVersion);
typedef hipError_t HIPAPI thipGetDevice(int* device);
typedef hipError_t HIPAPI thipGetDevice(hipDevice_t* device, int ordinal);
typedef hipError_t HIPAPI thipGetDeviceCount(int* count);
typedef hipError_t HIPAPI thipGetDeviceProperties(hipDeviceProp_t* props, int deviceId);
typedef hipError_t HIPAPI thipDeviceGet(hipDevice_t* device, int ordinal);
typedef hipError_t HIPAPI thipDeviceGetName(char* name, int len, hipDevice_t dev);
typedef hipError_t HIPAPI thipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
typedef hipError_t HIPAPI thipDeviceComputeCapability(int* major, int* minor, hipDevice_t dev);
@@ -1208,7 +1209,6 @@ extern thipDriverGetVersion *hipDriverGetVersion;
extern thipGetDevice *hipGetDevice;
extern thipGetDeviceCount *hipGetDeviceCount;
extern thipGetDeviceProperties *hipGetDeviceProperties;
extern thipDeviceGet* hipDeviceGet;
extern thipDeviceGetName *hipDeviceGetName;
extern thipDeviceGetAttribute *hipDeviceGetAttribute;
extern thipDeviceComputeCapability *hipDeviceComputeCapability;
@@ -1333,7 +1333,6 @@ enum {
HIPEW_SUCCESS = 0,
HIPEW_ERROR_OPEN_FAILED = -1,
HIPEW_ERROR_ATEXIT_FAILED = -2,
HIPEW_ERROR_OLD_DRIVER = -3,
};
enum {

View File

@@ -71,7 +71,6 @@ thipDriverGetVersion *hipDriverGetVersion;
thipGetDevice *hipGetDevice;
thipGetDeviceCount *hipGetDeviceCount;
thipGetDeviceProperties *hipGetDeviceProperties;
thipDeviceGet* hipDeviceGet;
thipDeviceGetName *hipDeviceGetName;
thipDeviceGetAttribute *hipDeviceGetAttribute;
thipDeviceComputeCapability *hipDeviceComputeCapability;
@@ -214,36 +213,6 @@ static void hipewHipExit(void) {
}
}
#ifdef _WIN32
static int hipewHasOldDriver(const char *hip_path) {
DWORD verHandle = 0;
DWORD verSize = GetFileVersionInfoSize(hip_path, &verHandle);
int old_driver = 0;
if (verSize != 0) {
LPSTR verData = (LPSTR)malloc(verSize);
if (GetFileVersionInfo(hip_path, verHandle, verSize, verData)) {
LPBYTE lpBuffer = NULL;
UINT size = 0;
if (VerQueryValue(verData, "\\", (VOID FAR * FAR *)&lpBuffer, &size)) {
if (size) {
VS_FIXEDFILEINFO *verInfo = (VS_FIXEDFILEINFO *)lpBuffer;
/* Magic value from
* https://docs.microsoft.com/en-us/windows/win32/api/verrsrc/ns-verrsrc-vs_fixedfileinfo */
if (verInfo->dwSignature == 0xfeef04bd) {
unsigned int fileVersionLS0 = (verInfo->dwFileVersionLS >> 16) & 0xffff;
unsigned int fileversionLS1 = (verInfo->dwFileVersionLS >> 0) & 0xffff;
/* Corresponds to versions older than AMD Radeon Pro 21.Q4. */
old_driver = ((fileVersionLS0 < 3354) || (fileVersionLS0 == 3354 && fileversionLS1 < 13));
}
}
}
}
free(verData);
}
return old_driver;
}
#endif
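The removed hipewHasOldDriver() gates HIP on Windows by reading the DLL's VERSIONINFO resource. The decoding it performs on dwFileVersionLS is plain bit unpacking; a reduced sketch (the 3354.13 threshold is the AMD Radeon Pro 21.Q4 build taken from the removed code):

#include <cstdint>

// VS_FIXEDFILEINFO packs four 16-bit version components into two
// 32-bit fields; dwFileVersionLS holds the last two (build.revision).
static bool is_old_driver(uint32_t dwFileVersionLS) {
  unsigned build = (dwFileVersionLS >> 16) & 0xffff;  // e.g. 3354
  unsigned revision = dwFileVersionLS & 0xffff;       // e.g. 13
  return (build < 3354) || (build == 3354 && revision < 13);
}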
static int hipewHipInit(void) {
/* Library paths. */
#ifdef _WIN32
@@ -271,14 +240,6 @@ static int hipewHipInit(void) {
return result;
}
#ifdef _WIN32
/* Test for driver version. */
if(hipewHasOldDriver(hip_paths[0])) {
result = HIPEW_ERROR_OLD_DRIVER;
return result;
}
#endif
/* Load library. */
hip_lib = dynamic_library_open_find(hip_paths);
@@ -294,7 +255,6 @@ static int hipewHipInit(void) {
HIP_LIBRARY_FIND_CHECKED(hipGetDevice);
HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount);
HIP_LIBRARY_FIND_CHECKED(hipGetDeviceProperties);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGet);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName);
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute);
HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability);

View File

@@ -1,7 +1,7 @@
Project: NanoSVG
URL: https://github.com/memononen/nanosvg
License: zlib
Upstream version: 3cdd4a9d7886
Upstream version:
Local modifications: Added some functionality to manage grease pencil layers
Added a fix to SVG import arc and float errors (https://developer.blender.org/rB11dc674c78b49fc4e0b7c134c375b6c8b8eacbcc)

View File

@@ -82,7 +82,7 @@ static void session_print_status()
string status, substatus;
/* get status */
double progress = options.session->progress.get_progress();
float progress = options.session->progress.get_progress();
options.session->progress.get_status(status, substatus);
if (substatus != "")
@@ -183,7 +183,7 @@ static void display_info(Progress &progress)
progress.get_time(total_time, sample_time);
progress.get_status(status, substatus);
double progress_val = progress.get_progress();
float progress_val = progress.get_progress();
if (substatus != "")
status += ": " + substatus;

View File

@@ -346,7 +346,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
name="Scrambling Distance",
default=1.0,
min=0.0, max=1.0,
description="Reduce randomization between pixels to improve GPU rendering performance, at the cost of possible rendering artifacts if set too low. Only works when not using adaptive sampling",
description="Lower values give faster rendering with GPU rendering and less noise with all devices at the cost of possible artifacts if set too low. Only works when not using adaptive sampling",
)
preview_scrambling_distance: BoolProperty(
name="Scrambling Distance viewport",
@@ -354,10 +354,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
description="Uses the Scrambling Distance value for the viewport. Faster but may flicker",
)
auto_scrambling_distance: BoolProperty(
name="Automatic Scrambling Distance",
adaptive_scrambling_distance: BoolProperty(
name="Adaptive Scrambling Distance",
default=False,
description="Automatically reduce the randomization between pixels to improve GPU rendering performance, at the cost of possible rendering artifacts. Only works when not using adaptive sampling",
description="Uses a formula to adapt the scrambling distance strength based on the sample count",
)
use_layer_samples: EnumProperty(
@@ -770,8 +770,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
)
use_auto_tile: BoolProperty(
name="Use Tiling",
description="Render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
name="Auto Tiles",
description="Automatically render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
default=True,
)
tile_size: IntProperty(
@@ -1419,9 +1419,10 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
elif device_type == 'HIP':
import sys
col.label(text="Requires discrete AMD GPU with RDNA architecture", icon='BLANK1')
if sys.platform[:3] == "win":
col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
col.label(text="Requires discrete AMD GPU with RDNA2 architecture", icon='BLANK1')
# TODO: provide driver version info.
#if sys.platform[:3] == "win":
# col.label(text="and AMD driver version ??? or newer", icon='BLANK1')
return
for device in devices:

View File

@@ -292,13 +292,13 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
layout.separator()
heading = layout.column(align=True, heading="Scrambling Distance")
heading.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
heading.prop(cscene, "auto_scrambling_distance", text="Automatic")
sub = heading.row()
col = layout.column(align=True)
col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
col.prop(cscene, "scrambling_distance", text="Scrambling Distance")
col.prop(cscene, "adaptive_scrambling_distance", text="Adaptive")
sub = col.row(align=True)
sub.active = not cscene.use_preview_adaptive_sampling
sub.prop(cscene, "preview_scrambling_distance", text="Viewport")
heading.prop(cscene, "scrambling_distance", text="Multiplier")
layout.separator()
@@ -1051,7 +1051,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
def has_geometry_visibility(ob):
return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT', 'VOLUME', 'POINTCLOUD', 'HAIR'}) or
return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT'}) or
(ob.instance_type == 'COLLECTION' and ob.instance_collection))

View File

@@ -819,14 +819,11 @@ void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, H
new_hair.set_used_shaders(used_shaders);
if (view_layer.use_hair) {
#ifdef WITH_HAIR_NODES
if (b_ob_info.object_data.is_a(&RNA_Hair)) {
/* Hair object. */
sync_hair(&new_hair, b_ob_info, false);
}
else
#endif
{
else {
/* Particle hair. */
bool need_undeformed = new_hair.need_attribute(scene, ATTR_STD_GENERATED);
BL::Mesh b_mesh = object_to_mesh(
@@ -873,15 +870,12 @@ void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,
/* Export deformed coordinates. */
if (ccl::BKE_object_is_deform_modified(b_ob_info, b_scene, preview)) {
#ifdef WITH_HAIR_NODES
if (b_ob_info.object_data.is_a(&RNA_Hair)) {
/* Hair object. */
sync_hair(hair, b_ob_info, true, motion_step);
return;
}
else
#endif
{
else {
/* Particle hair. */
BL::Mesh b_mesh = object_to_mesh(
b_data, b_ob_info, b_depsgraph, false, Mesh::SUBDIVISION_NONE);

View File

@@ -31,11 +31,7 @@ CCL_NAMESPACE_BEGIN
static Geometry::Type determine_geom_type(BObjectInfo &b_ob_info, bool use_particle_hair)
{
#ifdef WITH_HAIR_NODES
if (b_ob_info.object_data.is_a(&RNA_Hair) || use_particle_hair) {
#else
if (use_particle_hair) {
#endif
return Geometry::HAIR;
}
@@ -219,11 +215,7 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
if (progress.get_cancel())
return;
#ifdef WITH_HAIR_NODES
if (b_ob_info.object_data.is_a(&RNA_Hair) || use_particle_hair) {
#else
if (use_particle_hair) {
#endif
Hair *hair = static_cast<Hair *>(geom);
sync_hair_motion(b_depsgraph, b_ob_info, hair, motion_step);
}

View File

@@ -24,14 +24,8 @@ CCL_NAMESPACE_BEGIN
/* Packed Images */
BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
const int frame,
const bool is_preview_render)
: b_image(b_image),
frame(frame),
/* Don't free cache for preview render to avoid race condition from T93560, to be fixed
properly later as we are close to release. */
free_cache(!is_preview_render && !b_image.has_data())
BlenderImageLoader::BlenderImageLoader(BL::Image b_image, int frame)
: b_image(b_image), frame(frame), free_cache(!b_image.has_data())
{
}

View File

@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
class BlenderImageLoader : public ImageLoader {
public:
BlenderImageLoader(BL::Image b_image, const int frame, const bool is_preview_render);
BlenderImageLoader(BL::Image b_image, int frame);
bool load_metadata(const ImageDeviceFeatures &features, ImageMetaData &metadata) override;
bool load_pixels(const ImageMetaData &metadata,

View File

@@ -62,46 +62,31 @@ bool BlenderSync::BKE_object_is_modified(BL::Object &b_ob)
return false;
}
bool BlenderSync::object_is_geometry(BObjectInfo &b_ob_info)
bool BlenderSync::object_is_geometry(BL::Object &b_ob)
{
BL::ID b_ob_data = b_ob_info.object_data;
BL::ID b_ob_data = b_ob.data();
if (!b_ob_data) {
return false;
}
BL::Object::type_enum type = b_ob_info.iter_object.type();
BL::Object::type_enum type = b_ob.type();
if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
/* Will be exported attached to mesh. */
return true;
}
else if (type == BL::Object::type_CURVE) {
/* Skip exporting curves without faces, overhead can be
* significant if there are many for path animation. */
BL::Curve b_curve(b_ob_data);
/* Other object types that are not meshes but evaluate to meshes are presented to render engines
* as separate instance objects. Metaballs and surface objects have not been affected by that
* change yet. */
if (type == BL::Object::type_SURFACE || type == BL::Object::type_META) {
return true;
return (b_curve.bevel_object() || b_curve.extrude() != 0.0f || b_curve.bevel_depth() != 0.0f ||
b_curve.dimensions() == BL::Curve::dimensions_2D || b_ob.modifiers.length());
}
return b_ob_data.is_a(&RNA_Mesh);
}
bool BlenderSync::object_can_have_geometry(BL::Object &b_ob)
{
BL::Object::type_enum type = b_ob.type();
switch (type) {
case BL::Object::type_MESH:
case BL::Object::type_CURVE:
case BL::Object::type_SURFACE:
case BL::Object::type_META:
case BL::Object::type_FONT:
case BL::Object::type_HAIR:
case BL::Object::type_POINTCLOUD:
case BL::Object::type_VOLUME:
return true;
default:
return false;
else {
return (b_ob_data.is_a(&RNA_Mesh) || b_ob_data.is_a(&RNA_Curve) ||
b_ob_data.is_a(&RNA_MetaBall));
}
}
@@ -207,7 +192,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
}
/* only interested in object that we can create meshes from */
if (!object_is_geometry(b_ob_info)) {
if (!object_is_geometry(b_ob)) {
return NULL;
}
@@ -294,7 +279,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
object->set_visibility(visibility);
object->set_is_shadow_catcher(b_ob.is_shadow_catcher() || b_parent.is_shadow_catcher());
object->set_is_shadow_catcher(b_ob.is_shadow_catcher());
float shadow_terminator_shading_offset = get_float(cobject, "shadow_terminator_offset");
object->set_shadow_terminator_shading_offset(shadow_terminator_shading_offset);

View File

@@ -120,7 +120,7 @@ void BlenderOutputDriver::write_render_tile(const Tile &tile)
b_pass.rect(&pixels[0]);
}
b_engine_.end_result(b_rr, false, false, true);
b_engine_.end_result(b_rr, true, false, true);
}
CCL_NAMESPACE_END

View File

@@ -129,7 +129,7 @@ void BlenderSession::create_session()
/* reset status/progress */
last_status = "";
last_error = "";
last_progress = -1.0;
last_progress = -1.0f;
start_resize_time = 0.0;
/* create session */
@@ -615,24 +615,6 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
sync->sync_camera(b_render, b_camera_override, width, height, "");
sync->sync_data(
b_render, b_depsgraph, b_v3d, b_camera_override, width, height, &python_thread_state);
/* Filtering settings for combined pass. */
if (pass->get_type() == PASS_COMBINED) {
Integrator *integrator = scene->integrator;
integrator->set_use_direct_light((bake_filter & BL::BakeSettings::pass_filter_DIRECT) != 0);
integrator->set_use_indirect_light((bake_filter & BL::BakeSettings::pass_filter_INDIRECT) !=
0);
integrator->set_use_diffuse((bake_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0);
integrator->set_use_glossy((bake_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0);
integrator->set_use_transmission(
(bake_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0);
integrator->set_use_emission((bake_filter & BL::BakeSettings::pass_filter_EMIT) != 0);
}
/* Always use transparent background for baking. */
scene->background->set_transparent(true);
/* Load built-in images from Blender. */
builtin_images_load();
}
@@ -859,7 +841,7 @@ void BlenderSession::get_status(string &status, string &substatus)
session->progress.get_status(status, substatus);
}
void BlenderSession::get_progress(double &progress, double &total_time, double &render_time)
void BlenderSession::get_progress(float &progress, double &total_time, double &render_time)
{
session->progress.get_time(total_time, render_time);
progress = session->progress.get_progress();
@@ -867,10 +849,10 @@ void BlenderSession::get_progress(double &progress, double &total_time, double &
void BlenderSession::update_bake_progress()
{
double progress = session->progress.get_progress();
float progress = session->progress.get_progress();
if (progress != last_progress) {
b_engine.update_progress((float)progress);
b_engine.update_progress(progress);
last_progress = progress;
}
}
@@ -879,7 +861,7 @@ void BlenderSession::update_status_progress()
{
string timestatus, status, substatus;
string scene_status = "";
double progress;
float progress;
double total_time, remaining_time = 0, render_time;
float mem_used = (float)session->stats.mem_used / 1024.0f / 1024.0f;
float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;
@@ -923,7 +905,7 @@ void BlenderSession::update_status_progress()
last_status_time = current_time;
}
if (progress != last_progress) {
b_engine.update_progress((float)progress);
b_engine.update_progress(progress);
last_progress = progress;
}

View File

@@ -82,7 +82,7 @@ class BlenderSession {
void tag_redraw();
void tag_update();
void get_status(string &status, string &substatus);
void get_progress(double &progress, double &total_time, double &render_time);
void get_progress(float &progress, double &total_time, double &render_time);
void test_cancel();
void update_status_progress();
void update_bake_progress();
@@ -108,7 +108,7 @@ class BlenderSession {
string last_status;
string last_error;
double last_progress;
float last_progress;
double last_status_time;
int width, height;

View File

@@ -762,8 +762,7 @@ static ShaderNode *add_node(Scene *scene,
int scene_frame = b_scene.frame_current();
int image_frame = image_user_frame_number(b_image_user, b_image, scene_frame);
image->handle = scene->image_manager->add_image(
new BlenderImageLoader(b_image, image_frame, b_engine.is_preview()),
image->image_params());
new BlenderImageLoader(b_image, image_frame), image->image_params());
}
else {
ustring filename = ustring(
@@ -798,9 +797,8 @@ static ShaderNode *add_node(Scene *scene,
if (is_builtin) {
int scene_frame = b_scene.frame_current();
int image_frame = image_user_frame_number(b_image_user, b_image, scene_frame);
env->handle = scene->image_manager->add_image(
new BlenderImageLoader(b_image, image_frame, b_engine.is_preview()),
env->image_params());
env->handle = scene->image_manager->add_image(new BlenderImageLoader(b_image, image_frame),
env->image_params());
}
else {
env->set_filename(

View File

@@ -162,19 +162,19 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
/* Object */
else if (b_id.is_a(&RNA_Object)) {
BL::Object b_ob(b_id);
const bool can_have_geometry = object_can_have_geometry(b_ob);
const bool is_light = !can_have_geometry && object_is_light(b_ob);
const bool is_geometry = object_is_geometry(b_ob);
const bool is_light = !is_geometry && object_is_light(b_ob);
if (b_ob.is_instancer() && b_update.is_updated_shading()) {
/* Needed for e.g. object color updates on instancer. */
object_map.set_recalc(b_ob);
}
if (can_have_geometry || is_light) {
if (is_geometry || is_light) {
const bool updated_geometry = b_update.is_updated_geometry();
/* Geometry (mesh, hair, volume). */
if (can_have_geometry) {
if (is_geometry) {
if (b_update.is_updated_transform() || b_update.is_updated_shading()) {
object_map.set_recalc(b_ob);
}
@@ -365,8 +365,8 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
int samples = get_int(cscene, "samples");
float scrambling_distance = get_float(cscene, "scrambling_distance");
bool auto_scrambling_distance = get_boolean(cscene, "auto_scrambling_distance");
if (auto_scrambling_distance) {
bool adaptive_scrambling_distance = get_boolean(cscene, "adaptive_scrambling_distance");
if (adaptive_scrambling_distance) {
scrambling_distance *= 4.0f / sqrtf(samples);
}
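The adaptive formula above scales the user value by 4/sqrt(samples), so higher sample counts get a smaller scrambling distance. A quick worked table (the diff does not show clamping, so handling of values above 1.0 at very low sample counts is assumed to happen elsewhere):

#include <cmath>
#include <cstdio>

int main() {
  const float user_value = 1.0f;
  for (int samples : {16, 64, 256, 1024}) {
    float sd = user_value * 4.0f / std::sqrt((float)samples);
    std::printf("%5d samples -> scrambling distance %.4f\n", samples, sd);
  }
  // 16 -> 1.0000, 64 -> 0.5000, 256 -> 0.2500, 1024 -> 0.1250
}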

View File

@@ -208,8 +208,7 @@ class BlenderSync {
/* util */
void find_shader(BL::ID &id, array<Node *> &used_shaders, Shader *default_shader);
bool BKE_object_is_modified(BL::Object &b_ob);
bool object_is_geometry(BObjectInfo &b_ob_info);
bool object_can_have_geometry(BL::Object &b_ob);
bool object_is_geometry(BL::Object &b_ob);
bool object_is_light(BL::Object &b_ob);
/* variables */

View File

@@ -30,17 +30,15 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
: BVH(params_, geometry_, objects_),
device(device),
traversable_handle(0),
as_data(make_unique<device_only_memory<char>>(
device, params.top_level ? "optix tlas" : "optix blas", false)),
motion_transform_data(
make_unique<device_only_memory<char>>(device, "optix motion transform", false))
as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
motion_transform_data(device, "optix motion transform", false)
{
}
BVHOptiX::~BVHOptiX()
{
/* Acceleration structure memory is delayed freed on device, since deleting the
* BVH may happen while still being used for rendering. */
// Acceleration structure memory is delayed freed on device, since deleting the
// BVH may happen while still being used for rendering.
device->release_optix_bvh(this);
}

View File

@@ -25,16 +25,14 @@
# include "device/memory.h"
# include "util/unique_ptr.h"
CCL_NAMESPACE_BEGIN
class BVHOptiX : public BVH {
public:
Device *device;
uint64_t traversable_handle;
unique_ptr<device_only_memory<char>> as_data;
unique_ptr<device_only_memory<char>> motion_transform_data;
device_only_memory<char> as_data;
device_only_memory<char> motion_transform_data;
protected:
friend class BVH;

View File

@@ -68,8 +68,7 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
{
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
* optimization. */
VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
<< " CPU kernels.";
VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::num_threads();
@@ -134,7 +133,8 @@ void CPUDevice::mem_alloc(device_memory &mem)
<< string_human_readable_size(mem.memory_size()) << ")";
}
if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
if (mem.type == MEM_DEVICE_ONLY) {
assert(!mem.host_pointer);
size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
void *data = util_aligned_malloc(mem.memory_size(), alignment);
mem.device_pointer = (device_ptr)data;
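The replacement narrows the aligned-malloc path to MEM_DEVICE_ONLY and asserts that no host pointer exists. util_aligned_malloc itself is not shown in this diff; a sketch of what such a helper typically does (assumed implementation, not Cycles' actual code):

#include <cstdlib>
#ifdef _WIN32
#  include <malloc.h>
#endif

// alignment must be a power of two (and a multiple of sizeof(void *)
// for posix_memalign).
static void *aligned_malloc_bytes(size_t size, size_t alignment) {
#ifdef _WIN32
  return _aligned_malloc(size, alignment);
#else
  void *p = nullptr;
  return posix_memalign(&p, alignment, size) == 0 ? p : nullptr;
#endif
}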
@@ -193,7 +193,7 @@ void CPUDevice::mem_free(device_memory &mem)
tex_free((device_texture &)mem);
}
else if (mem.device_pointer) {
if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
if (mem.type == MEM_DEVICE_ONLY) {
util_aligned_free((void *)mem.device_pointer);
}
mem.device_pointer = 0;
@@ -296,6 +296,11 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
Device::build_bvh(bvh, progress, refit);
}
const CPUKernels *CPUDevice::get_cpu_kernels() const
{
return &kernels;
}
void CPUDevice::get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{

View File

@@ -57,6 +57,8 @@ class CPUDevice : public Device {
RTCDevice embree_device;
#endif
CPUKernels kernels;
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
~CPUDevice();
@@ -88,6 +90,7 @@ class CPUDevice : public Device {
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
virtual const CPUKernels *get_cpu_kernels() const override;
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
virtual void *get_cpu_osl_memory() override;

View File

@@ -26,9 +26,6 @@ CCL_NAMESPACE_BEGIN
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
#define REGISTER_KERNEL_FILM_CONVERT(name) \
film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
CPUKernels::CPUKernels()
: /* Integrator. */
@@ -53,25 +50,11 @@ CPUKernels::CPUKernels()
REGISTER_KERNEL(adaptive_sampling_filter_x),
REGISTER_KERNEL(adaptive_sampling_filter_y),
/* Cryptomatte. */
REGISTER_KERNEL(cryptomatte_postprocess),
/* Film Convert. */
REGISTER_KERNEL_FILM_CONVERT(depth),
REGISTER_KERNEL_FILM_CONVERT(mist),
REGISTER_KERNEL_FILM_CONVERT(sample_count),
REGISTER_KERNEL_FILM_CONVERT(float),
REGISTER_KERNEL_FILM_CONVERT(light_path),
REGISTER_KERNEL_FILM_CONVERT(float3),
REGISTER_KERNEL_FILM_CONVERT(motion),
REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
REGISTER_KERNEL_FILM_CONVERT(combined),
REGISTER_KERNEL_FILM_CONVERT(float4)
REGISTER_KERNEL(cryptomatte_postprocess)
{
}
#undef REGISTER_KERNEL
#undef REGISTER_KERNEL_FILM_CONVERT
#undef KERNEL_FUNCTIONS
CCL_NAMESPACE_END
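The REGISTER_KERNEL machinery above is an X-macro-style idiom: one macro expands a kernel name into the list of per-ISA entry points, another turns that list into a member initializer. A self-contained toy version (all names are illustrative, not Cycles'):

#include <cstdio>

#define KERNEL_NAME_EVAL(arch, name) arch##_##name
#define KERNEL_FUNCTIONS(name) \
  KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))

static void cpu_filter() { std::puts("generic path"); }
static void cpu_avx2_filter() { std::puts("AVX2 path"); }

struct Dispatcher {
  void (*generic)();
  void (*avx2)();
  Dispatcher(void (*g)(), void (*a)()) : generic(g), avx2(a) {}
};

struct Kernels {
  Dispatcher filter;
  Kernels() : REGISTER_KERNEL(filter) {}  // expands to filter(cpu_filter, cpu_avx2_filter)
};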

View File

@@ -17,13 +17,11 @@
#pragma once
#include "device/cpu/kernel_function.h"
#include "util/half.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
struct KernelGlobalsCPU;
struct KernelFilmConvert;
struct IntegratorStateCPU;
struct TileInfo;
@@ -42,7 +40,7 @@ class CPUKernels {
IntegratorInitFunction integrator_init_from_camera;
IntegratorInitFunction integrator_init_from_bake;
IntegratorShadeFunction integrator_intersect_closest;
IntegratorFunction integrator_intersect_closest;
IntegratorFunction integrator_intersect_shadow;
IntegratorFunction integrator_intersect_subsurface;
IntegratorFunction integrator_intersect_volume_stack;
@@ -104,41 +102,6 @@ class CPUKernels {
CryptomattePostprocessFunction cryptomatte_postprocess;
/* Film Convert. */
using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
const float *buffer,
float *pixel,
const int width,
const int buffer_stride,
const int pixel_stride)>;
using FilmConvertHalfRGBAFunction =
CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
const float *buffer,
half4 *pixel,
const int width,
const int buffer_stride)>;
#define KERNEL_FILM_CONVERT_FUNCTION(name) \
FilmConvertFunction film_convert_##name; \
FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
KERNEL_FILM_CONVERT_FUNCTION(depth)
KERNEL_FILM_CONVERT_FUNCTION(mist)
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
KERNEL_FILM_CONVERT_FUNCTION(float)
KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(float3)
KERNEL_FILM_CONVERT_FUNCTION(motion)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
KERNEL_FILM_CONVERT_FUNCTION(combined)
KERNEL_FILM_CONVERT_FUNCTION(float4)
#undef KERNEL_FILM_CONVERT_FUNCTION
CPUKernels();
};

View File

@@ -680,7 +680,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
void *shared_pointer = 0;
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
@@ -703,14 +703,8 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
}
if (mem_alloc_result != CUDA_SUCCESS) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
if (mem.name) {
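generic_alloc's can_map_host path falls back to pinned, device-mapped host memory when device allocation fails. Reduced to the driver-API calls, the fallback looks roughly like this (a sketch; the real function also records the mapping in cuda_mem_map and reference-counts shared pointers):

#include <cuda.h>

static CUresult alloc_with_host_fallback(size_t bytes, CUdeviceptr *dptr, void **host) {
  *host = nullptr;
  CUresult err = cuMemAlloc(dptr, bytes);
  if (err == CUDA_SUCCESS)
    return err;
  /* Device allocation failed: try pinned host memory mapped into the
   * device address space. */
  err = cuMemHostAlloc(host, bytes, CU_MEMHOSTALLOC_DEVICEMAP);
  if (err != CUDA_SUCCESS)
    return err;
  return cuMemHostGetDevicePointer(dptr, *host, 0);
}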
@@ -783,7 +777,6 @@ void CUDADevice::generic_free(device_memory &mem)
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
@@ -1152,7 +1145,6 @@ void CUDADevice::tex_free(device_texture &mem)
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
if (cmem.texobject) {

View File

@@ -23,7 +23,6 @@
#include "device/queue.h"
#include "device/cpu/device.h"
#include "device/cpu/kernel.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
@@ -364,11 +363,10 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
return nullptr;
}
const CPUKernels &Device::get_cpu_kernels()
const CPUKernels *Device::get_cpu_kernels() const
{
/* Initialize CPU kernels once and reuse. */
static CPUKernels kernels;
return kernels;
LOG(FATAL) << "Device does not support CPU kernels.";
return nullptr;
}
void Device::get_cpu_kernel_thread_globals(
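One variant in this hunk exposes the CPU kernels through a function-local static, which C++11 guarantees is initialized exactly once even under concurrent first use; the other makes the accessor virtual and fatal on devices without CPU kernels. Sketch of the static approach:

struct CPUKernels { /* kernel entry points */ };

// Function-local statics are initialized thread-safely on first call
// (C++11 "magic statics"), so every device can share one instance.
static const CPUKernels &get_cpu_kernels_shared() {
  static CPUKernels kernels;
  return kernels;
}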

View File

@@ -180,7 +180,7 @@ class Device {
* These may not be used on GPU or multi-devices. */
/* Get CPU kernel functions for native instruction set. */
static const CPUKernels &get_cpu_kernels();
virtual const CPUKernels *get_cpu_kernels() const;
/* Get kernel globals to pass to kernels. */
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);

View File

@@ -57,16 +57,9 @@ bool device_hip_init()
}
}
else {
if (hipew_result == HIPEW_ERROR_ATEXIT_FAILED) {
VLOG(1) << "HIPEW initialization failed: Error setting up atexit() handler";
}
else if (hipew_result == HIPEW_ERROR_OLD_DRIVER) {
VLOG(1) << "HIPEW initialization failed: Driver version too old, requires AMD Radeon Pro "
"21.Q4 driver or newer";
}
else {
VLOG(1) << "HIPEW initialization failed: Error opening HIP dynamic library";
}
VLOG(1) << "HIPEW initialization failed: "
<< ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
"Error opening the library");
}
return result;

View File

@@ -99,7 +99,7 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
}
/* Setup device and context. */
result = hipDeviceGet(&hipDevice, hipDevId);
result = hipGetDevice(&hipDevice, hipDevId);
if (result != hipSuccess) {
set_error(string_printf("Failed to get HIP device handle from ordinal (%s)",
hipewErrorString(result)));
@@ -154,7 +154,7 @@ bool HIPDevice::support_device(const uint /*kernel_features*/)
hipDeviceProp_t props;
hipGetDeviceProperties(&props, hipDevId);
set_error(string_printf("HIP backend requires AMD RDNA graphics card or up, but found %s.",
set_error(string_printf("HIP backend requires AMD RDNA2 graphics card or up, but found %s.",
props.name));
return false;
}
@@ -222,6 +222,7 @@ string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
const string include_path = source_path;
string cflags = string_printf(
"-m%d "
"--ptxas-options=\"-v\" "
"--use_fast_math "
"-DHIPCC "
"-I\"%s\"",
@@ -233,7 +234,10 @@ string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
return cflags;
}
string HIPDevice::compile_kernel(const uint kernel_features, const char *name, const char *base)
string HIPDevice::compile_kernel(const uint kernel_features,
const char *name,
const char *base,
bool force_ptx)
{
/* Compute kernel name. */
int major, minor;
@@ -251,11 +255,13 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
/* Attempt to use kernel provided with Blender. */
if (!use_adaptive_compilation()) {
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch));
VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
VLOG(1) << "Using precompiled kernel.";
return fatbin;
if (!force_ptx) {
const string fatbin = path_get(string_printf("lib/%s_%s.fatbin", name, arch));
VLOG(1) << "Testing for pre-compiled kernel " << fatbin << ".";
if (path_exists(fatbin)) {
VLOG(1) << "Using precompiled kernel.";
return fatbin;
}
}
}
@@ -292,9 +298,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
# ifdef _WIN32
if (!use_adaptive_compilation() && have_precompiled_kernels()) {
if (!hipSupportsDevice(hipDevId)) {
if (major < 3) {
set_error(
string_printf("HIP backend requires compute capability 10.1 or up, but found %d.%d. "
string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. "
"Your GPU is not supported.",
major,
minor));
@@ -745,7 +751,6 @@ void HIPDevice::generic_free(device_memory &mem)
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
@@ -989,16 +994,16 @@ void HIPDevice::tex_alloc(device_texture &mem)
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
hip_assert(hipArray3DCreate((hArray *)&array_3d, &desc));
hip_assert(hipArray3DCreate(&array_3d, &desc));
if (!array_3d) {
return;
}
HIP_MEMCPY3D param;
memset(&param, 0, sizeof(HIP_MEMCPY3D));
memset(&param, 0, sizeof(param));
param.dstMemoryType = hipMemoryTypeArray;
param.dstArray = array_3d;
param.dstArray = &array_3d;
param.srcMemoryType = hipMemoryTypeHost;
param.srcHost = mem.host_pointer;
param.srcPitch = src_pitch;
@@ -1064,13 +1069,13 @@ void HIPDevice::tex_alloc(device_texture &mem)
if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
/* Bindless textures. */
/* Kepler+, bindless textures. */
hipResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
if (array_3d) {
resDesc.resType = hipResourceTypeArray;
resDesc.res.array.h_Array = array_3d;
resDesc.res.array.h_Array = &array_3d;
resDesc.flags = 0;
}
else if (mem.data_height > 0) {
@@ -1115,7 +1120,6 @@ void HIPDevice::tex_free(device_texture &mem)
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
if (cmem.texobject) {
@@ -1156,8 +1160,6 @@ bool HIPDevice::should_use_graphics_interop()
* possible, but from empirical measurements it can be considerably slower than a naive
* pixel copy. */
/* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
# if 0
HIPContextScope scope(this);
int num_all_devices = 0;
@@ -1176,7 +1178,6 @@ bool HIPDevice::should_use_graphics_interop()
return true;
}
}
# endif
return false;
}

View File

@@ -95,7 +95,8 @@ class HIPDevice : public Device {
string compile_kernel(const uint kernel_features,
const char *name,
const char *base = "hip");
const char *base = "hip",
bool force_ptx = false);
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);

View File

@@ -64,7 +64,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
return (major > 10) || (major == 10 && minor >= 1);
return (major > 10) || (major == 10 && minor >= 3);
}
CCL_NAMESPACE_END
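
The version gate above is an ordered comparison of the (major, minor) compute capability pair; RDNA2 parts report 10.3 and up, which is what the stricter side of the hunk demands. A sketch of the same predicate:

// Sketch of the (major, minor) gate used above; the 10.3 threshold admits
// RDNA2 while 10.1 also admitted RDNA1.
inline bool supports_device(int major, int minor)
{
  return (major > 10) || (major == 10 && minor >= 3);
}
// e.g. supports_device(10, 1) == false (RDNA1), supports_device(10, 3) == true.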

View File

@@ -44,6 +44,45 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
{
}
device_memory::device_memory(device_memory &&other) noexcept
: data_type(other.data_type),
data_elements(other.data_elements),
data_size(other.data_size),
device_size(other.device_size),
data_width(other.data_width),
data_height(other.data_height),
data_depth(other.data_depth),
type(other.type),
name(other.name),
device(other.device),
device_pointer(other.device_pointer),
host_pointer(other.host_pointer),
shared_pointer(other.shared_pointer),
shared_counter(other.shared_counter),
original_device_ptr(other.original_device_ptr),
original_device_size(other.original_device_size),
original_device(other.original_device),
need_realloc_(other.need_realloc_),
modified(other.modified)
{
other.data_elements = 0;
other.data_size = 0;
other.device_size = 0;
other.data_width = 0;
other.data_height = 0;
other.data_depth = 0;
other.device = 0;
other.device_pointer = 0;
other.host_pointer = 0;
other.shared_pointer = 0;
other.shared_counter = 0;
other.original_device_ptr = 0;
other.original_device_size = 0;
other.original_device = 0;
other.need_realloc_ = false;
other.modified = false;
}
device_memory::~device_memory()
{
assert(shared_pointer == 0);

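The pattern in this move constructor, copying every member and then zeroing the source, is what keeps the destructor's assert(shared_pointer == 0) and the device allocation maps consistent: only one object ever owns the device pointer. A stripped-down sketch of the idiom, using a hypothetical Buffer type rather than Cycles code:

// Stripped-down sketch of the move-ownership idiom used above.
#include <cstdint>

struct Buffer {
  uint64_t device_pointer = 0;

  Buffer() = default;
  Buffer(Buffer &&other) noexcept : device_pointer(other.device_pointer)
  {
    other.device_pointer = 0; /* Moved-from object no longer owns the allocation. */
  }
  Buffer(const Buffer &) = delete;
  Buffer &operator=(const Buffer &) = delete;

  ~Buffer()
  {
    /* Would free device_pointer here; a moved-from object frees nothing. */
  }
};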
View File

@@ -281,16 +281,11 @@ class device_memory {
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);
device_memory(device_memory &&other) noexcept;
/* No copying allowed.
*
* This is because a device implementation might need to register device memory in an allocation
* map of some sort and use the pointer as a key to identify blocks. Moving data from one place to
* another bypassing device allocation routines would make those maps hard to maintain. */
/* No copying allowed. */
device_memory(const device_memory &) = delete;
device_memory(device_memory &&other) noexcept = delete;
device_memory &operator=(const device_memory &) = delete;
device_memory &operator=(device_memory &&) = delete;
/* Host allocation on the device. All host_pointer memory should be
* allocated with these functions, for devices that support using

View File

@@ -44,14 +44,22 @@
CCL_NAMESPACE_BEGIN
OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
: device(device), queue(device), state(device, "__denoiser_state", true)
: device(device), queue(device), state(device, "__denoiser_state")
{
}
OptiXDevice::Denoiser::~Denoiser()
{
const CUDAContextScope scope(device);
if (optix_denoiser != nullptr) {
optixDenoiserDestroy(optix_denoiser);
}
}
OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: CUDADevice(info, stats, profiler),
sbt_data(this, "__sbt", MEM_READ_ONLY),
launch_params(this, "__params", false),
launch_params(this, "__params"),
denoiser_(this)
{
/* Make the CUDA context current. */
@@ -125,11 +133,6 @@ OptiXDevice::~OptiXDevice()
}
}
/* Make sure denoiser is destroyed before device context! */
if (denoiser_.optix_denoiser != nullptr) {
optixDenoiserDestroy(denoiser_.optix_denoiser);
}
optixDeviceContextDestroy(context);
}
@@ -507,7 +510,7 @@ class OptiXDevice::DenoiseContext {
: denoise_params(task.params),
render_buffers(task.render_buffers),
buffer_params(task.buffer_params),
guiding_buffer(device, "denoiser guiding passes buffer", true),
guiding_buffer(device, "denoiser guiding passes buffer"),
num_samples(task.num_samples)
{
num_input_passes = 1;
@@ -522,9 +525,9 @@ class OptiXDevice::DenoiseContext {
}
}
use_guiding_passes = (num_input_passes - 1) > 0;
const int num_guiding_passes = num_input_passes - 1;
if (use_guiding_passes) {
if (num_guiding_passes) {
if (task.allow_inplace_modification) {
guiding_params.device_pointer = render_buffers->buffer.device_pointer;
@@ -577,7 +580,6 @@ class OptiXDevice::DenoiseContext {
/* Number of input passes. Including the color and extra auxiliary passes. */
int num_input_passes = 0;
bool use_guiding_passes = false;
bool use_pass_albedo = false;
bool use_pass_normal = false;
@@ -709,7 +711,7 @@ void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
return;
}
}
else if (context.use_guiding_passes && !context.albedo_replaced_with_fake) {
else if (!context.albedo_replaced_with_fake) {
context.albedo_replaced_with_fake = true;
if (!denoise_filter_guiding_set_fake_albedo(context)) {
LOG(ERROR) << "Error replacing real albedo with the fake one.";
@@ -882,31 +884,27 @@ bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
optix_assert(optixDenoiserComputeMemoryResources(
denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
/* Denoiser is invoked on whole images only, so no overlap needed (would be used for tiling). */
denoiser_.scratch_size = sizes.withoutOverlapScratchSizeInBytes;
denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
denoiser_.scratch_offset = sizes.stateSizeInBytes;
/* Allocate denoiser state if tile size has changed since last setup. */
denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
/* Initialize denoiser state for the current tile size. */
const OptixResult result = optixDenoiserSetup(
denoiser_.optix_denoiser,
0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
on a stream that is not the default stream */
buffer_params.width,
buffer_params.height,
denoiser_.state.device_pointer,
denoiser_.scratch_offset,
denoiser_.state.device_pointer + denoiser_.scratch_offset,
denoiser_.scratch_size);
const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
denoiser_.queue.stream(),
buffer_params.width,
buffer_params.height,
denoiser_.state.device_pointer,
denoiser_.scratch_offset,
denoiser_.state.device_pointer +
denoiser_.scratch_offset,
denoiser_.scratch_size);
if (result != OPTIX_SUCCESS) {
set_error("Failed to set up OptiX denoiser");
return false;
}
cuda_assert(cuCtxSynchronize());
denoiser_.is_configured = true;
denoiser_.configured_size.x = buffer_params.width;
denoiser_.configured_size.y = buffer_params.height;
@@ -941,6 +939,8 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
}
device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
/* Optional albedo and color passes. */
if (context.num_input_passes > 1) {
const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
@@ -971,7 +971,6 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
/* Finally run denoising. */
OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
OptixDenoiserLayer image_layers = {};
image_layers.input = color_layer;
image_layers.output = output_layer;
@@ -1001,13 +1000,6 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
const OptixBuildInput &build_input,
uint16_t num_motion_steps)
{
/* Allocate and build acceleration structures only one at a time, to prevent parallel builds
* from running out of memory (since both original and compacted acceleration structure memory
* may be allocated at the same time for the duration of this function). The builds would
* otherwise happen on the same CUDA stream anyway. */
static thread_mutex mutex;
thread_scoped_lock lock(mutex);
const CUDAContextScope scope(this);
const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
@@ -1033,15 +1025,14 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
/* Allocate required output buffers. */
device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
device_only_memory<char> temp_mem(this, "optix temp as build mem");
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
if (!temp_mem.device_pointer) {
/* Make sure temporary memory allocation succeeded. */
return false;
}
/* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
device_only_memory<char> &out_data = *bvh->as_data;
device_only_memory<char> &out_data = bvh->as_data;
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
assert(out_data.device == this);
out_data.alloc_to_device(sizes.outputSizeInBytes);
@@ -1089,13 +1080,12 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
/* There is no point compacting if the size does not change. */
if (compacted_size < sizes.outputSizeInBytes) {
device_only_memory<char> compacted_data(this, "optix compacted as", false);
device_only_memory<char> compacted_data(this, "optix compacted as");
compacted_data.alloc_to_device(compacted_size);
if (!compacted_data.device_pointer) {
if (!compacted_data.device_pointer)
/* Do not compact if memory allocation for compacted acceleration structure fails.
* Can just use the uncompacted one then, so succeed here regardless. */
return !have_error();
}
optix_assert(optixAccelCompact(
context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
@@ -1106,8 +1096,6 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
std::swap(out_data.device_size, compacted_data.device_size);
std::swap(out_data.device_pointer, compacted_data.device_pointer);
/* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
*/
}
}
@@ -1135,7 +1123,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
operation = OPTIX_BUILD_OPERATION_UPDATE;
}
else {
bvh_optix->as_data->free();
bvh_optix->as_data.free();
bvh_optix->traversable_handle = 0;
}
@@ -1196,7 +1184,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
const float4 pw = make_float4(
curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
/* Convert Catmull-Rom data to B-spline. */
/* Convert Catmull-Rom data to Bezier spline. */
static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
@@ -1356,9 +1344,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
unsigned int num_instances = 0;
unsigned int max_num_instances = 0xFFFFFFFF;
bvh_optix->as_data->free();
bvh_optix->as_data.free();
bvh_optix->traversable_handle = 0;
bvh_optix->motion_transform_data->free();
bvh_optix->motion_transform_data.free();
optixDeviceContextGetProperty(context,
OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
@@ -1391,8 +1379,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
assert(bvh_optix->motion_transform_data->device == this);
bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
assert(bvh_optix->motion_transform_data.device == this);
bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
}
for (Object *ob : bvh->objects) {
@@ -1453,7 +1441,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
motion_transform_offset = align_up(motion_transform_offset,
OPTIX_TRANSFORM_BYTE_ALIGNMENT);
CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
motion_transform_offset;
motion_transform_offset += motion_transform_size;

View File

@@ -23,7 +23,6 @@
# include "device/optix/queue.h"
# include "device/optix/util.h"
# include "kernel/types.h"
# include "util/unique_ptr.h"
CCL_NAMESPACE_BEGIN
@@ -77,12 +76,13 @@ class OptiXDevice : public CUDADevice {
device_only_memory<KernelParamsOptiX> launch_params;
OptixTraversableHandle tlas_handle = 0;
vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
vector<device_only_memory<char>> delayed_free_bvh_memory;
thread_mutex delayed_free_bvh_mutex;
class Denoiser {
public:
explicit Denoiser(OptiXDevice *device);
~Denoiser();
OptiXDevice *device;
OptiXDeviceQueue queue;

View File

@@ -73,8 +73,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
sizeof(device_ptr),
cuda_stream_));
if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
cuda_device_assert(
cuda_device_,
cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),

View File

@@ -33,10 +33,7 @@ unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoisePa
return make_unique<OptiXDenoiser>(path_trace_device, params);
}
/* Always fallback to OIDN. */
DenoiseParams oidn_params = params;
oidn_params.type = DENOISER_OPENIMAGEDENOISE;
return make_unique<OIDNDenoiser>(path_trace_device, oidn_params);
return make_unique<OIDNDenoiser>(path_trace_device, params);
}
Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)

View File

@@ -47,6 +47,9 @@ static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/)
OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr);
return !oidn_denoiser->is_cancelled();
}
#endif
#ifdef WITH_OPENIMAGEDENOISE
class OIDNPass {
public:
@@ -544,6 +547,7 @@ class OIDNDenoiseContext {
* the fake values and denoising of passes which do need albedo can no longer happen. */
bool albedo_replaced_with_fake_ = false;
};
#endif
static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers)
{
@@ -578,20 +582,18 @@ static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue,
}
}
#endif
bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
RenderBuffers *render_buffers,
const int num_samples,
bool allow_inplace_modification)
{
#ifdef WITH_OPENIMAGEDENOISE
thread_scoped_lock lock(mutex_);
/* Make sure the host-side data is available for denoising. */
unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers);
copy_render_buffers_from_device(queue, render_buffers);
#ifdef WITH_OPENIMAGEDENOISE
OIDNDenoiseContext context(
this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification);
@@ -618,11 +620,6 @@ bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
* copies data from the device it doesn't overwrite the denoiser buffers. */
copy_render_buffers_to_device(queue, render_buffers);
}
#else
(void)buffer_params;
(void)render_buffers;
(void)num_samples;
(void)allow_inplace_modification;
#endif
/* This code is not supposed to run when compiled without OIDN support, so can assume if we made

View File

@@ -14,12 +14,9 @@
* limitations under the License.
*/
#include "device/device.h"
#include "integrator/pass_accessor_cpu.h"
#include "session/buffers.h"
#include "util/log.h"
#include "util/tbb.h"
@@ -36,16 +33,70 @@ CCL_NAMESPACE_BEGIN
* Kernel processing.
*/
template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const
{
KernelFilmConvert kfilm_convert;
init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
if (destination.pixels) {
/* NOTE: No overlays are applied since they are not used for final renders.
* Can be supported via some sort of specialization to avoid code duplication. */
run_get_pass_kernel_processor_float(
&kfilm_convert, render_buffers, buffer_params, destination, processor);
}
if (destination.pixels_half_rgba) {
/* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
if (destination.num_components == 1) {
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
render_buffers,
buffer_params,
destination,
[&processor](const KernelFilmConvert *kfilm_convert,
ccl_global const float *buffer,
float *pixel_rgba) {
float pixel;
processor(kfilm_convert, buffer, &pixel);
pixel_rgba[0] = pixel;
pixel_rgba[1] = pixel;
pixel_rgba[2] = pixel;
pixel_rgba[3] = 1.0f;
});
}
else if (destination.num_components == 3) {
run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
render_buffers,
buffer_params,
destination,
[&processor](const KernelFilmConvert *kfilm_convert,
ccl_global const float *buffer,
float *pixel_rgba) {
processor(kfilm_convert, buffer, pixel_rgba);
pixel_rgba[3] = 1.0f;
});
}
else if (destination.num_components == 4) {
run_get_pass_kernel_processor_half_rgba(
&kfilm_convert, render_buffers, buffer_params, destination, processor);
}
}
}
template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertFunction func) const
const Processor &processor) const
{
/* NOTE: No overlays are applied since they are not used for final renders.
* Can be supported via some sort of specialization to avoid code duplication. */
DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
const int64_t pass_stride = buffer_params.pass_stride;
@@ -61,16 +112,21 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
(y * buffer_params.width + destination.offset) * pixel_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
for (int64_t x = 0; x < buffer_params.window_width;
++x, buffer += pass_stride, pixel += pixel_stride) {
processor(kfilm_convert, buffer, pixel);
}
});
}
template<typename Processor>
inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertHalfRGBAFunction func) const
const Processor &processor) const
{
const int64_t pass_stride = buffer_params.pass_stride;
const int64_t buffer_row_stride = buffer_params.stride * buffer_params.pass_stride;
@@ -85,7 +141,16 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
tbb::parallel_for(0, buffer_params.window_height, [&](int64_t y) {
const float *buffer = window_data + y * buffer_row_stride;
half4 *pixel = dst_start + y * destination_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
for (int64_t x = 0; x < buffer_params.window_width; ++x, buffer += pass_stride, ++pixel) {
float pixel_rgba[4];
processor(kfilm_convert, buffer, pixel_rgba);
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
*pixel = float4_to_half4_display(
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
}
});
}
@@ -98,25 +163,8 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const BufferParams &buffer_params, \
const Destination &destination) const \
{ \
const CPUKernels &kernels = Device::get_cpu_kernels(); \
KernelFilmConvert kfilm_convert; \
init_kernel_film_convert(&kfilm_convert, buffer_params, destination); \
\
if (destination.pixels) { \
run_get_pass_kernel_processor_float(&kfilm_convert, \
render_buffers, \
buffer_params, \
destination, \
kernels.film_convert_##pass); \
} \
\
if (destination.pixels_half_rgba) { \
run_get_pass_kernel_processor_half_rgba(&kfilm_convert, \
render_buffers, \
buffer_params, \
destination, \
kernels.film_convert_half_rgba_##pass); \
} \
run_get_pass_kernel_processor( \
render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
}
/* Float (scalar) passes. */
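
In the compare target, both float and half4 destinations are routed through a single templated run_get_pass_kernel_processor that inlines the per-pixel pass function as a callable, replacing the precompiled CPUKernels function pointers. A minimal sketch of that shape, with invented names:

// Minimal sketch of the templated-processor shape (invented names).
#include <cstdint>

template<typename Processor>
void run_rows(const float *buffer, float *pixels,
              int64_t width, int64_t pass_stride, int64_t pixel_stride,
              const Processor &processor)
{
  for (int64_t x = 0; x < width; ++x, buffer += pass_stride, pixels += pixel_stride) {
    processor(buffer, pixels); /* Per-pixel conversion, inlined at the call site. */
  }
}

// Usage: run_rows(buf, out, w, 4, 1, [](const float *b, float *p) { *p = b[0]; });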

View File

@@ -16,8 +16,6 @@
#pragma once
#include "device/cpu/kernel.h"
#include "integrator/pass_accessor.h"
CCL_NAMESPACE_BEGIN
@@ -30,19 +28,25 @@ class PassAccessorCPU : public PassAccessor {
using PassAccessor::PassAccessor;
protected:
inline void run_get_pass_kernel_processor_float(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertFunction func) const;
template<typename Processor>
inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;
inline void run_get_pass_kernel_processor_half_rgba(
const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const CPUKernels::FilmConvertHalfRGBAFunction func) const;
template<typename Processor>
inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;
template<typename Processor>
inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
const RenderBuffers *render_buffers,
const BufferParams &buffer_params,
const Destination &destination,
const Processor &processor) const;
#define DECLARE_PASS_ACCESSOR(pass) \
virtual void get_pass_##pass(const RenderBuffers *render_buffers, \

View File

@@ -296,13 +296,13 @@ static BufferParams scale_buffer_params(const BufferParams &params, int resoluti
scaled_params.window_x = params.window_x / resolution_divider;
scaled_params.window_y = params.window_y / resolution_divider;
scaled_params.window_width = max(1, params.window_width / resolution_divider);
scaled_params.window_height = max(1, params.window_height / resolution_divider);
scaled_params.window_width = params.window_width / resolution_divider;
scaled_params.window_height = params.window_height / resolution_divider;
scaled_params.full_x = params.full_x / resolution_divider;
scaled_params.full_y = params.full_y / resolution_divider;
scaled_params.full_width = max(1, params.full_width / resolution_divider);
scaled_params.full_height = max(1, params.full_height / resolution_divider);
scaled_params.full_width = params.full_width / resolution_divider;
scaled_params.full_height = params.full_height / resolution_divider;
scaled_params.update_offset_stride();
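
The max(1, ...) guards on one side of this hunk matter at extreme resolution dividers: integer division can collapse a small window to zero, and downstream code divides by these dimensions. For example, a 10-pixel-wide window with resolution_divider 16 yields 10 / 16 == 0 without the clamp. A one-line sketch of the guarded form:

// Sketch: why the clamp matters. Integer division can produce a
// zero-sized dimension, so the guarded variant pins it at 1 pixel.
#include <algorithm>

inline int scale_dim(int dim, int resolution_divider)
{
  return std::max(1, dim / resolution_divider); /* 10 / 16 -> 0 -> clamped to 1 */
}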
@@ -479,11 +479,7 @@ void PathTrace::set_denoiser_params(const DenoiseParams &params)
}
denoiser_ = Denoiser::create(device_, params);
/* Only take into account the "immediate" cancel so that interactive rendering responds to
* navigation as quickly as possible, while still allowing the denoiser to run after the user
* hits Esc during offline rendering. */
denoiser_->is_cancelled_cb = [this]() { return render_cancel_.is_requested; };
denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
}
void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
@@ -851,8 +847,7 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work)
{
if (progress_ != nullptr) {
const int2 tile_size = get_render_tile_size();
const uint64_t num_samples_added = uint64_t(tile_size.x) * tile_size.y *
render_work.path_trace.num_samples;
const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
const int current_sample = render_work.path_trace.start_sample +
render_work.path_trace.num_samples;
progress_->add_samples(num_samples_added, current_sample);
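
The uint64_t cast on one side of this hunk avoids 32-bit overflow in the sample counter: tile_x * tile_y * num_samples exceeds INT_MAX for plausible renders, e.g. 3840 * 2160 * 512 is roughly 4.2e9, past the 2^31 - 1 limit of int. A sketch of the overflow-safe form:

// Sketch of the overflow-safe accumulation. Promoting one operand to
// uint64_t forces the whole product into 64-bit arithmetic.
#include <cstdint>

inline uint64_t samples_added(int tile_x, int tile_y, int num_samples)
{
  return uint64_t(tile_x) * tile_y * num_samples; /* 3840*2160*512 fits in 64 bits. */
}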

View File

@@ -58,7 +58,7 @@ PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
DeviceScene *device_scene,
bool *cancel_requested_flag)
: PathTraceWork(device, film, device_scene, cancel_requested_flag),
kernels_(Device::get_cpu_kernels())
kernels_(*(device->get_cpu_kernels()))
{
DCHECK_EQ(device->info.type, DEVICE_CPU);
}
@@ -77,10 +77,8 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
const int64_t image_height = effective_buffer_params_.height;
const int64_t total_pixels_num = image_width * image_height;
if (device_->profiler.active()) {
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.start_profiling();
}
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.start_profiling();
}
tbb::task_arena local_arena = local_tbb_arena_create(device_);
@@ -108,10 +106,9 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
});
});
if (device_->profiler.active()) {
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.stop_profiling();
}
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.stop_profiling();
}
statistics.occupancy = 1.0f;

View File

@@ -257,8 +257,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
* become busy after adding new tiles). This is especially important for the shadow catcher which
* schedules work in halves of available number of paths. */
work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
0);
work_tile_scheduler_.reset(effective_buffer_params_,
start_sample,
samples_num,
@@ -438,15 +437,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
/* Closest ray intersection kernels with integrator state and render buffer. */
void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
queue_->enqueue(kernel, work_size, args);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {

View File

@@ -827,26 +827,6 @@ int RenderScheduler::get_num_samples_to_path_trace() const
num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
}
/* When a time limit is used, clamp the calculated number of samples to keep occupancy.
* This is because the time limit causes the last render iteration to happen with fewer
* samples, which conflicts with the occupancy (a lower number of samples causes lower
* occupancy; the calculation is also based on the number of previously rendered samples).
*
* When no time limit is used the number of samples per render iteration either increases
* or stays the same, so there is no need to clamp the number of samples calculated for
* occupancy. */
if (time_limit_ && state_.start_render_time) {
const double remaining_render_time = max(
0.0, time_limit_ - (time_dt() - state_.start_render_time));
const double time_per_sample_average = path_trace_time_.get_average();
const double predicted_render_time = num_samples_to_occupy * time_per_sample_average;
if (predicted_render_time > remaining_render_time) {
num_samples_to_occupy = lround(num_samples_to_occupy *
(remaining_render_time / predicted_render_time));
}
}
num_samples_to_render = max(num_samples_to_render,
min(num_samples_to_occupy, max_num_samples_to_render));
}
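
The clamp in this hunk scales the occupancy-driven sample count down proportionally when the predicted render time would overshoot the remaining budget. The arithmetic, as a sketch with invented names:

// Sketch of the proportional clamp described in the comment above.
#include <algorithm>
#include <cmath>

inline int clamp_samples_to_time(int num_samples,
                                 double remaining_render_time,
                                 double time_per_sample_average)
{
  const double predicted = num_samples * time_per_sample_average;
  if (predicted > remaining_render_time && predicted > 0.0) {
    num_samples = int(std::lround(num_samples * (remaining_render_time / predicted)));
  }
  return std::max(num_samples, 0);
}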

View File

@@ -96,7 +96,7 @@ bool ShaderEval::eval_cpu(Device *device,
device->get_cpu_kernel_thread_globals(kernel_thread_globals);
/* Find required kernel function. */
const CPUKernels &kernels = Device::get_cpu_kernels();
const CPUKernels &kernels = *(device->get_cpu_kernels());
/* Simple parallel_for over all work items. */
KernelShaderEvalInput *input_data = input.data();

View File

@@ -46,8 +46,7 @@ ccl_device_inline uint round_up_to_power_of_two(uint x)
return next_power_of_two(x);
}
TileSize tile_calculate_best_size(const bool accel_rt,
const int2 &image_size,
TileSize tile_calculate_best_size(const int2 &image_size,
const int num_samples,
const int max_num_path_states,
const float scrambling_distance)
@@ -74,7 +73,7 @@ TileSize tile_calculate_best_size(const bool accel_rt,
TileSize tile_size;
const int num_path_states_per_sample = max_num_path_states / num_samples;
if (scrambling_distance < 0.9f && accel_rt) {
if (scrambling_distance < 0.9f) {
/* Prefer large tiles for scrambling distance, bounded by max num path states. */
tile_size.width = min(image_size.x, max_num_path_states);
tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1));
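
The scrambling-distance branch in this hunk prefers tiles as wide as the path-state budget allows, with the height taking whatever budget remains. A sketch of the width/height split it computes (illustrative names only):

// Sketch of the large-tile split used for low scrambling distance:
// width is capped by the image and the total path-state budget,
// height gets the remaining budget (at least one row).
#include <algorithm>

struct TileDim { int width, height; };

inline TileDim large_tile(int image_w, int image_h, int max_num_path_states)
{
  TileDim t;
  t.width = std::max(1, std::min(image_w, max_num_path_states));
  t.height = std::min(image_h, std::max(max_num_path_states / t.width, 1));
  return t;
}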

View File

@@ -49,8 +49,7 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
* of active path states.
* Will attempt to provide a best guess that keeps the path tracing threads of a device as
* localized as possible, and has as many threads active for every tile as possible. */
TileSize tile_calculate_best_size(const bool accel_rt,
const int2 &image_size,
TileSize tile_calculate_best_size(const int2 &image_size,
const int num_samples,
const int max_num_path_states,
const float scrambling_distance);

View File

@@ -28,11 +28,6 @@ WorkTileScheduler::WorkTileScheduler()
{
}
void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt)
{
accelerated_rt_ = accelerated_rt;
}
void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
{
max_num_path_states_ = max_num_path_states;
@@ -64,7 +59,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
void WorkTileScheduler::reset_scheduler_state()
{
tile_size_ = tile_calculate_best_size(
accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
VLOG(3) << "Will schedule tiles of size " << tile_size_;

View File

@@ -31,9 +31,6 @@ class WorkTileScheduler {
public:
WorkTileScheduler();
/* To indicate if there is accelerated RT support. */
void set_accelerated_rt(bool state);
/* Maximum number of path states which are allowed to be used by a single scheduled work tile.
*
* Affects the scheduled work size: the work size will be as big as possible, but will not exceed
@@ -57,9 +54,6 @@ class WorkTileScheduler {
protected:
void reset_scheduler_state();
/* Used to indicate if there is accelerated ray tracing. */
bool accelerated_rt_ = false;
/* Maximum allowed path states to be used.
*
* TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the

View File

@@ -565,12 +565,6 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
set(name ${name}_experimental)
endif()
if(WITH_NANOVDB)
set(hip_flags ${hip_flags}
-D WITH_NANOVDB
-I "${NANOVDB_INCLUDE_DIR}")
endif()
if(WITH_CYCLES_DEBUG)
set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
endif()

View File

@@ -438,7 +438,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
if (label & LABEL_TRANSMIT) {
float threshold_squared = kernel_data.background.transparent_roughness_squared_threshold;
if (threshold_squared >= 0.0f && !(label & LABEL_DIFFUSE)) {
if (threshold_squared >= 0.0f) {
if (bsdf_get_specular_roughness_squared(sc) <= threshold_squared) {
label |= LABEL_TRANSMIT_TRANSPARENT;
}

View File

@@ -18,7 +18,6 @@
/* CPU Kernel Interface */
#include "util/half.h"
#include "util/types.h"
#include "kernel/types.h"

View File

@@ -37,7 +37,7 @@
KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
KERNEL_INTEGRATOR_SHADE_FUNCTION(intersect_closest);
KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
@@ -52,37 +52,6 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
#define KERNEL_FILM_CONVERT_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride); \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride);
KERNEL_FILM_CONVERT_FUNCTION(depth)
KERNEL_FILM_CONVERT_FUNCTION(mist)
KERNEL_FILM_CONVERT_FUNCTION(sample_count)
KERNEL_FILM_CONVERT_FUNCTION(float)
KERNEL_FILM_CONVERT_FUNCTION(light_path)
KERNEL_FILM_CONVERT_FUNCTION(float3)
KERNEL_FILM_CONVERT_FUNCTION(motion)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
KERNEL_FILM_CONVERT_FUNCTION(combined)
KERNEL_FILM_CONVERT_FUNCTION(float4)
#undef KERNEL_FILM_CONVERT_FUNCTION
/* --------------------------------------------------------------------
* Shader evaluation.
*/

View File

@@ -47,8 +47,8 @@
# include "kernel/integrator/megakernel.h"
# include "kernel/film/adaptive_sampling.h"
# include "kernel/film/id_passes.h"
# include "kernel/film/read.h"
# include "kernel/film/id_passes.h"
# include "kernel/bake/bake.h"
@@ -112,7 +112,7 @@ CCL_NAMESPACE_BEGIN
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
DEFINE_INTEGRATOR_SHADE_KERNEL(intersect_closest)
DEFINE_INTEGRATOR_KERNEL(intersect_closest)
DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
@@ -232,85 +232,6 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
#endif
}
/* --------------------------------------------------------------------
* Film Convert.
*/
#ifdef KERNEL_STUB
# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride) \
{ \
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
} \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride) \
{ \
STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
}
#else
# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
float *pixel, \
const int width, \
const int buffer_stride, \
const int pixel_stride) \
{ \
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
} \
} \
void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
const KernelFilmConvert *kfilm_convert, \
const float *buffer, \
half4 *pixel, \
const int width, \
const int buffer_stride) \
{ \
for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
if (is_float) { \
pixel_rgba[1] = pixel_rgba[0]; \
pixel_rgba[2] = pixel_rgba[0]; \
} \
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
*pixel = float4_to_half4_display( \
make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
} \
}
#endif
KERNEL_FILM_CONVERT_FUNCTION(depth, true)
KERNEL_FILM_CONVERT_FUNCTION(mist, true)
KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
KERNEL_FILM_CONVERT_FUNCTION(float, true)
KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
KERNEL_FILM_CONVERT_FUNCTION(float3, false)
KERNEL_FILM_CONVERT_FUNCTION(motion, false)
KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
KERNEL_FILM_CONVERT_FUNCTION(combined, false)
KERNEL_FILM_CONVERT_FUNCTION(float4, false)
#undef KERNEL_FILM_CONVERT_FUNCTION
#undef KERNEL_INVOKE
#undef DEFINE_INTEGRATOR_KERNEL
#undef DEFINE_INTEGRATOR_SHADE_KERNEL

View File

@@ -116,15 +116,13 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
kernel_gpu_integrator_intersect_closest(const int *path_index_array,
ccl_global float *render_buffer,
const int work_size)
kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
integrator_intersect_closest(NULL, state, render_buffer);
integrator_intersect_closest(NULL, state);
}
}
@@ -488,26 +486,6 @@ ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *k
processor(kfilm_convert, buffer, pixel);
}
ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
const int rgba_offset,
const int rgba_stride,
const int x,
const int y,
const half4 half_pixel)
{
/* Work around HIP issue with half float display, see T92972. */
#ifdef __KERNEL_HIP__
ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
out[0] = half_pixel.x;
out[1] = half_pixel.y;
out[2] = half_pixel.z;
out[3] = half_pixel.w;
#else
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
*out = half_pixel;
#endif
}
/* Common implementation for half4 destination and 4-channel input pass. */
template<typename Processor>
ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
@@ -538,9 +516,8 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
const half4 half_pixel = float4_to_half4_display(
make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel);
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
*out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
}
/* Common implementation for half4 destination and 3-channel input pass. */
@@ -811,7 +788,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
/* Normal pass. */
if (guiding_pass_normal != PASS_UNUSED) {
if (render_pass_denoising_normal != PASS_UNUSED) {
kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
const float *normal_in = buffer + render_pass_denoising_normal;

View File

@@ -29,20 +29,17 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
ccl_private uint *y,
ccl_private uint *sample)
{
uint sample_offset, pixel_offset;
if (kernel_data.integrator.scrambling_distance < 0.9f) {
/* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
sample_offset = global_work_index / tile_pixels;
pixel_offset = global_work_index - sample_offset * tile_pixels;
}
else {
/* Keeping threads for the same pixel together.
* Appears to improve performance by a few % on CUDA and OptiX. */
sample_offset = global_work_index % tile->num_samples;
pixel_offset = global_work_index / tile->num_samples;
}
#if 0
/* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
#else
/* Keeping threads for the same pixel together.
* Appears to improve performance by a few % on CUDA and OptiX. */
uint sample_offset = global_work_index % tile->num_samples;
uint pixel_offset = global_work_index / tile->num_samples;
#endif
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
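
Both scheduling orders in this hunk are just two factorizations of the same flat work index. A sketch of the decomposition, including the pixel-major order that the comment notes is a few percent faster on CUDA and OptiX (invented signature):

// Sketch: decomposing a flat work index two ways. Sample-major keeps
// threads of one sample together; pixel-major keeps threads of one
// pixel together.
struct WorkItem { unsigned sample_offset, pixel_offset; };

inline WorkItem decompose(unsigned global_index, unsigned tile_pixels,
                          unsigned num_samples, bool sample_major)
{
  WorkItem w;
  if (sample_major) {
    w.sample_offset = global_index / tile_pixels;
    w.pixel_offset = global_index - w.sample_offset * tile_pixels;
  }
  else {
    w.sample_offset = global_index % num_samples;
    w.pixel_offset = global_index / num_samples;
  }
  return w;
}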

View File

@@ -57,7 +57,7 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
const int global_index = optixGetLaunchIndex().x;
const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
global_index;
integrator_intersect_closest(nullptr, path_index, __params.render_buffer);
integrator_intersect_closest(nullptr, path_index);
}
extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()

View File

@@ -33,72 +33,62 @@ CCL_NAMESPACE_BEGIN
* them separately. */
ccl_device_inline void bsdf_eval_init(ccl_private BsdfEval *eval,
const ClosureType closure_type,
const bool is_diffuse,
float3 value)
{
eval->diffuse = zero_float3();
eval->glossy = zero_float3();
if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
if (is_diffuse) {
eval->diffuse = value;
}
else if (CLOSURE_IS_BSDF_GLOSSY(closure_type)) {
else {
eval->glossy = value;
}
eval->sum = value;
}
ccl_device_inline void bsdf_eval_accum(ccl_private BsdfEval *eval,
const ClosureType closure_type,
float3 value)
const bool is_diffuse,
float3 value,
float mis_weight)
{
if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
value *= mis_weight;
if (is_diffuse) {
eval->diffuse += value;
}
else if (CLOSURE_IS_BSDF_GLOSSY(closure_type)) {
else {
eval->glossy += value;
}
eval->sum += value;
}
ccl_device_inline bool bsdf_eval_is_zero(ccl_private BsdfEval *eval)
{
return is_zero(eval->sum);
return is_zero(eval->diffuse) && is_zero(eval->glossy);
}
ccl_device_inline void bsdf_eval_mul(ccl_private BsdfEval *eval, float value)
{
eval->diffuse *= value;
eval->glossy *= value;
eval->sum *= value;
}
ccl_device_inline void bsdf_eval_mul3(ccl_private BsdfEval *eval, float3 value)
{
eval->diffuse *= value;
eval->glossy *= value;
eval->sum *= value;
}
ccl_device_inline float3 bsdf_eval_sum(ccl_private const BsdfEval *eval)
{
return eval->sum;
return eval->diffuse + eval->glossy;
}
ccl_device_inline float3 bsdf_eval_pass_diffuse_weight(ccl_private const BsdfEval *eval)
ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(ccl_private const BsdfEval *eval)
{
/* Ratio of diffuse weight to recover proportions for writing to render pass.
/* Ratio of diffuse and glossy to recover proportions for writing to render pass.
* We assume reflection, transmission and volume scatter to be exclusive. */
return safe_divide_float3_float3(eval->diffuse, eval->sum);
}
ccl_device_inline float3 bsdf_eval_pass_glossy_weight(ccl_private const BsdfEval *eval)
{
/* Ratio of glossy weight to recover proportions for writing to render pass.
* We assume reflection, transmission and volume scatter to be exclusive. */
return safe_divide_float3_float3(eval->glossy, eval->sum);
return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
}
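
Both variants above recover per-pass weights from accumulated BSDF values by component-wise safe division, returning zero where the denominator is zero. A sketch of that ratio computation with a plain float3 stand-in (not the Cycles math types):

// Sketch of the safe component-wise ratio used for pass weights.
struct float3s { float x, y, z; };

inline float safe_div(float a, float b) { return (b != 0.0f) ? a / b : 0.0f; }

inline float3s pass_weight(const float3s &part, const float3s &sum)
{
  /* e.g. part = diffuse contribution, sum = diffuse + glossy (+ transmission). */
  return {safe_div(part.x, sum.x), safe_div(part.y, sum.y), safe_div(part.z, sum.z)};
}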
/* --------------------------------------------------------------------
@@ -361,48 +351,38 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(KernelGlobals kg
/* Directly visible, write to emission or background pass. */
pass_offset = pass;
}
else if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
if (path_flag & PATH_RAY_SURFACE_PASS) {
/* Indirectly visible through reflection. */
const float3 diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
const float3 glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);
else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
/* Indirectly visible through reflection. */
const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
((INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_glossy_direct :
kernel_data.film.pass_glossy_indirect) :
((INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_transmission_direct :
kernel_data.film.pass_transmission_indirect);
/* Glossy */
const int glossy_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_glossy_direct :
kernel_data.film.pass_glossy_indirect);
if (glossy_pass_offset != PASS_UNUSED) {
kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
}
/* Transmission */
const int transmission_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_transmission_direct :
kernel_data.film.pass_transmission_indirect);
if (transmission_pass_offset != PASS_UNUSED) {
/* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
* GPU memory. */
const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
kernel_write_pass_float3(buffer + transmission_pass_offset,
transmission_weight * contribution);
}
/* Reconstruct diffuse subset of throughput. */
pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_diffuse_direct :
kernel_data.film.pass_diffuse_indirect;
if (pass_offset != PASS_UNUSED) {
contribution *= diffuse_weight;
}
if (glossy_pass_offset != PASS_UNUSED) {
/* Glossy is a subset of the throughput, reconstruct it here using the
* diffuse-glossy ratio. */
const float3 ratio = INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
const float3 glossy_contribution = (one_float3() - ratio) * contribution;
kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
}
else if (path_flag & PATH_RAY_VOLUME_PASS) {
/* Indirectly visible through volume. */
pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_volume_direct :
kernel_data.film.pass_volume_indirect;
/* Reconstruct diffuse subset of throughput. */
pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_diffuse_direct :
kernel_data.film.pass_diffuse_indirect;
if (pass_offset != PASS_UNUSED) {
contribution *= INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
}
}
else if (path_flag & PATH_RAY_VOLUME_PASS) {
/* Indirectly visible through volume. */
pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
kernel_data.film.pass_volume_direct :
kernel_data.film.pass_volume_indirect;
}
/* Single write call for GPU coherence. */
if (pass_offset != PASS_UNUSED) {
@@ -446,60 +426,49 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
#ifdef __PASSES__
if (kernel_data.film.light_pass_flag & PASS_ANY) {
const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
int pass_offset = PASS_UNUSED;
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
int pass_offset = PASS_UNUSED;
if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
/* Indirectly visible through reflection. */
const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_glossy_direct :
kernel_data.film.pass_glossy_indirect) :
((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_transmission_direct :
kernel_data.film.pass_transmission_indirect);
if (path_flag & PATH_RAY_SURFACE_PASS) {
/* Indirectly visible through reflection. */
const float3 diffuse_weight = INTEGRATOR_STATE(state, shadow_path, pass_diffuse_weight);
const float3 glossy_weight = INTEGRATOR_STATE(state, shadow_path, pass_glossy_weight);
/* Glossy */
const int glossy_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_glossy_direct :
kernel_data.film.pass_glossy_indirect);
if (glossy_pass_offset != PASS_UNUSED) {
kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
}
/* Transmission */
const int transmission_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_transmission_direct :
kernel_data.film.pass_transmission_indirect);
if (transmission_pass_offset != PASS_UNUSED) {
/* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
* GPU memory. */
const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
kernel_write_pass_float3(buffer + transmission_pass_offset,
transmission_weight * contribution);
}
/* Reconstruct diffuse subset of throughput. */
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_diffuse_direct :
kernel_data.film.pass_diffuse_indirect;
if (pass_offset != PASS_UNUSED) {
contribution *= diffuse_weight;
}
}
else if (path_flag & PATH_RAY_VOLUME_PASS) {
/* Indirectly visible through volume. */
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_volume_direct :
kernel_data.film.pass_volume_indirect;
if (glossy_pass_offset != PASS_UNUSED) {
/* Glossy is a subset of the throughput, reconstruct it here using the
* diffuse-glossy ratio. */
const float3 ratio = INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
const float3 glossy_contribution = (one_float3() - ratio) * contribution;
kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
}
/* Single write call for GPU coherence. */
/* Reconstruct diffuse subset of throughput. */
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_diffuse_direct :
kernel_data.film.pass_diffuse_indirect;
if (pass_offset != PASS_UNUSED) {
kernel_write_pass_float3(buffer + pass_offset, contribution);
contribution *= INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
}
}
else if (path_flag & PATH_RAY_VOLUME_PASS) {
/* Indirectly visible through volume. */
pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
kernel_data.film.pass_volume_direct :
kernel_data.film.pass_volume_indirect;
}
/* Single write call for GPU coherence. */
if (pass_offset != PASS_UNUSED) {
kernel_write_pass_float3(buffer + pass_offset, contribution);
}
/* Write shadow pass. */
if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
(path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
(path_flag & PATH_RAY_CAMERA)) {
const float3 unshadowed_throughput = INTEGRATOR_STATE(
state, shadow_path, unshadowed_throughput);
const float3 shadowed_throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
@@ -571,10 +540,11 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
/* Write emission to render buffer. */
ccl_device_inline void kernel_accum_emission(KernelGlobals kg,
ConstIntegratorState state,
const float3 throughput,
const float3 L,
ccl_global float *ccl_restrict render_buffer)
{
float3 contribution = L;
float3 contribution = throughput * L;
kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);

View File

@@ -160,6 +160,40 @@ ccl_device_forceinline void kernel_write_denoising_features_volume(KernelGlobals
}
#endif /* __DENOISING_FEATURES__ */
#ifdef __SHADOW_CATCHER__
/* Write shadow catcher passes on a bounce from the shadow catcher object. */
ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
KernelGlobals kg,
IntegratorState state,
ccl_private const ShaderData *sd,
ccl_global float *ccl_restrict render_buffer)
{
if (!kernel_data.integrator.has_shadow_catcher) {
return;
}
kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, sd->object_flag)) {
return;
}
ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
/* Count sample for the shadow catcher object. */
kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
/* Since the split is done, the sample does not contribute to the matte, so accumulate it as
* transparency to the matte. */
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
average(throughput));
}
#endif /* __SHADOW_CATCHER__ */
ccl_device_inline size_t kernel_write_id_pass(ccl_global float *ccl_restrict buffer,
size_t depth,
float id,
@@ -177,7 +211,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals kg,
#ifdef __PASSES__
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
if (!(path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
if (!(path_flag & PATH_RAY_CAMERA)) {
return;
}

View File

@@ -460,7 +460,7 @@ ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
const float transparency = in_matte[3] * scale;
const float alpha = saturatef(1.0f - transparency);
const float alpha_matte = (1.0f - alpha) * (1.0f - saturatef(average(shadow_catcher))) + alpha;
const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
if (kfilm_convert->use_approximate_shadow_catcher_background) {
kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);

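The saturatef on one side of this hunk clamps the averaged shadow-catcher value before it enters the matte alpha blend; without it, catcher averages above 1 push alpha_matte negative. Worked through with a stand-in clamp:

// Sketch of the matte alpha blend with the clamp. average(shadow_catcher)
// can exceed 1 for bright catchers; saturating keeps the blend in [0, 1].
inline float saturate(float v) { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); }

inline float alpha_matte(float alpha, float catcher_avg)
{
  return (1.0f - alpha) * (1.0f - saturate(catcher_avg)) + alpha;
  /* catcher_avg = 1.5, alpha = 0: unclamped -> -0.5; clamped -> 0.0. */
}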
View File

@@ -70,16 +70,14 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
/* Setup render buffers. */
const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
const int pass_stride = kernel_data.film.pass_stride;
ccl_global float *buffer = render_buffer + index * pass_stride;
render_buffer += index * pass_stride;
ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive;
ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential;
const int seed = __float_as_uint(primitive[0]);
int prim = __float_as_uint(primitive[1]);
if (prim == -1) {
/* Accumulate transparency for empty pixels. */
kernel_accum_transparent(kg, state, 0, 1.0f, buffer);
return false;
}

View File

@@ -31,6 +31,7 @@
CCL_NAMESPACE_BEGIN
template<uint32_t current_kernel>
ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
IntegratorState state,
const int shader_flags)
@@ -85,80 +86,36 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
return false;
}
#ifdef __SHADOW_CATCHER__
/* Split path if a shadow catcher was hit. */
ccl_device_forceinline void integrator_split_shadow_catcher(
/* Note that current_kernel is a template value since making this a variable
* leads to poor performance with CUDA atomics. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_shader_next_kernel(
KernelGlobals kg,
IntegratorState state,
ccl_private const Intersection *ccl_restrict isect,
ccl_global float *ccl_restrict render_buffer)
const int shader,
const int shader_flags)
{
/* Test if we hit a shadow catcher object, and potentially split the path to continue tracing two
* paths from here. */
const int object_flags = intersection_get_object_flags(kg, isect);
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
return;
}
kernel_write_shadow_catcher_bounce_data(kg, state, render_buffer);
/* Mark state as having done a shadow catcher split so that it stops contributing to
* the shadow catcher matte pass, but keeps contributing to the combined pass. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
/* Copy current state to new state. */
state = integrator_state_shadow_catcher_split(kg, state);
/* Initialize new state.
/* Note on scheduling.
*
* When there is no shadow catcher split the scheduling is simple: schedule surface shading with
* or without raytrace support, depending on the shader used.
*
* When there is a shadow catcher split the general idea is to have the following configuration:
*
* - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
* will trace shadow catcher object.
*
* - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
* the matte ray.
*
* - Otherwise schedule background shading kernel, so that we have a background to alpha-over
* on. The background kernel will then schedule surface shading for the matte ray.
*
* Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
* the matte path. */
/* Mark current state so that it will only track contribution of shadow catcher objects ignoring
* non-catcher objects. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
/* If using background pass, schedule background shading kernel so that we have a background
* to alpha-over on. The background kernel will then continue the path afterwards. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
if (!integrator_state_volume_stack_is_empty(kg, state)) {
/* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
* objects from it, and then continue shading volume and shadow catcher surface after. */
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
return;
}
/* Continue with shading shadow catcher surface. */
const int shader = intersection_get_shader(kg, isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
}
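As the note above says, current_kernel is a non-type template parameter rather than a function argument: with a compile-time constant, the per-kernel queue-counter atomics index a statically known slot, which (per the comment) the CUDA compiler handles far better than a runtime variable. A minimal CPU-side sketch of the pattern; queue_counter and schedule_next are illustrative names, not the Cycles API:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

static std::atomic<uint32_t> queue_counter[8]; /* pending paths per kernel */

/* current_kernel is baked in at compile time, so the decrement below touches
 * a fixed counter; a runtime variable would force generic indexed atomics. */
template<uint32_t current_kernel>
static void schedule_next(uint32_t next_kernel)
{
  queue_counter[current_kernel].fetch_sub(1, std::memory_order_relaxed);
  queue_counter[next_kernel].fetch_add(1, std::memory_order_relaxed);
}

int main()
{
  queue_counter[0] = 1;
  schedule_next<0>(2); /* move one path from kernel 0 to kernel 2 */
  std::printf("%u %u\n", queue_counter[0].load(), queue_counter[2].load());
}
```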
/* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
KernelGlobals kg, IntegratorState state)
{
/* Continue with shading shadow catcher surface. Same as integrator_split_shadow_catcher, but
* using NEXT instead of INIT. */
Intersection isect ccl_optional_struct_init;
integrator_state_read_isect(kg, state, &isect);
const int shader = intersection_get_shader(kg, &isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
@@ -167,141 +124,26 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
else {
INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
}
/* Schedule next kernel to be executed after executing background shader for shadow catcher. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
KernelGlobals kg, IntegratorState state)
{
/* Same logic as integrator_split_shadow_catcher, but using NEXT instead of INIT. */
if (!integrator_state_volume_stack_is_empty(kg, state)) {
/* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
* objects from it, and then continue shading volume and shadow catcher surface after. */
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
return;
}
/* Continue with shading shadow catcher surface. */
integrator_intersect_next_kernel_after_shadow_catcher_volume<current_kernel>(kg, state);
}
#endif
/* Schedule next kernel to be executed after intersect closest.
*
* Note that current_kernel is a template value since making this a variable
* leads to poor performance with CUDA atomics. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel(
KernelGlobals kg,
IntegratorState state,
ccl_private const Intersection *ccl_restrict isect,
ccl_global float *ccl_restrict render_buffer,
const bool hit)
{
/* Continue with volume kernel if we are inside a volume, regardless if we hit anything. */
#ifdef __VOLUME__
if (!integrator_state_volume_stack_is_empty(kg, state)) {
const bool hit_surface = hit && !(isect->type & PRIMITIVE_LAMP);
const int shader = (hit_surface) ? intersection_get_shader(kg, isect) : SHADER_NONE;
const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
if (!integrator_intersect_terminate(kg, state, flags)) {
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
else {
INTEGRATOR_PATH_TERMINATE(current_kernel);
}
return;
}
#endif
if (hit) {
/* Hit a surface, continue with light or surface kernel. */
if (isect->type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
if (!integrator_intersect_terminate(kg, state, flags)) {
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
#ifdef __SHADOW_CATCHER__
/* Handle shadow catcher. */
integrator_split_shadow_catcher(kg, state, isect, render_buffer);
#endif
}
else {
INTEGRATOR_PATH_TERMINATE(current_kernel);
}
}
}
else {
/* Nothing hit, continue with background kernel. */
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
}
}
const int object_flags = intersection_get_object_flags(kg, isect);
if (kernel_shadow_catcher_split(kg, state, object_flags)) {
if (kernel_data.film.pass_background != PASS_UNUSED && !kernel_data.background.transparent) {
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
/* Schedule next kernel to be executed after shade volume.
*
* The logic here matches integrator_intersect_next_kernel, except that
* volume shading and termination testing have already been done. */
template<uint32_t current_kernel>
ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
KernelGlobals kg,
IntegratorState state,
ccl_private const Intersection *ccl_restrict isect,
ccl_global float *ccl_restrict render_buffer)
{
if (isect->prim != PRIM_NONE) {
/* Hit a surface, continue with light or surface kernel. */
if (isect->type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
return;
INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
}
else if (use_raytrace_kernel) {
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
if (use_raytrace_kernel) {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
}
else {
INTEGRATOR_PATH_NEXT_SORTED(
current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
#ifdef __SHADOW_CATCHER__
/* Handle shadow catcher. */
integrator_split_shadow_catcher(kg, state, isect, render_buffer);
#endif
return;
INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
}
}
else {
/* Nothing hit, continue with background kernel. */
INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
#endif
}
ccl_device void integrator_intersect_closest(KernelGlobals kg,
IntegratorState state,
ccl_global float *ccl_restrict render_buffer)
ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState state)
{
PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);
@@ -350,9 +192,56 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
/* Write intersection result into global integrator state memory. */
integrator_state_write_isect(kg, state, &isect);
/* Setup up next kernel to be executed. */
integrator_intersect_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, &isect, render_buffer, hit);
#ifdef __VOLUME__
if (!integrator_state_volume_stack_is_empty(kg, state)) {
const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, flags)) {
/* Continue with volume kernel if we are inside a volume, regardless
* if we hit anything. */
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
}
else {
INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
}
return;
}
#endif
if (hit) {
/* Hit a surface, continue with light or surface kernel. */
if (isect.type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
return;
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, &isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, flags)) {
integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
kg, state, &isect, shader, flags);
return;
}
else {
INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
return;
}
}
}
else {
/* Nothing hit, continue with background kernel. */
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
}
CCL_NAMESPACE_END
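To summarize the scheduling that both versions implement: paths inside a volume go to the volume kernel first, misses go to the background kernel, lamp hits to the light kernel, and surface hits to one of the two (sorted) surface kernels unless the path was terminated. A compressed sketch of that decision tree:

```cpp
enum NextKernel {
  SHADE_BACKGROUND, SHADE_LIGHT, SHADE_SURFACE,
  SHADE_SURFACE_RAYTRACE, SHADE_VOLUME, TERMINATE
};

/* Illustrative paraphrase of the dispatch above; the real kernels enqueue
 * work with INTEGRATOR_PATH_NEXT* instead of returning a tag. */
static NextKernel next_kernel(bool in_volume, bool terminated, bool hit,
                              bool hit_lamp, bool shader_has_raytrace)
{
  if (in_volume)
    return terminated ? TERMINATE : SHADE_VOLUME;
  if (!hit)
    return SHADE_BACKGROUND;
  if (hit_lamp)
    return SHADE_LIGHT;
  if (terminated)
    return TERMINATE;
  return shader_has_raytrace ? SHADE_SURFACE_RAYTRACE : SHADE_SURFACE;
}
```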


@@ -42,13 +42,10 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
/* Store to avoid global fetches on every intersection step. */
const uint volume_stack_size = kernel_data.volume_stack_size;
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_ALL_VISIBILITY);
#ifdef __VOLUME_RECORD_ALL__
Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
uint num_hits = scene_intersect_volume_all(
kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
if (num_hits > 0) {
Intersection *isect = hits;
@@ -63,7 +60,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
Intersection isect;
int step = 0;
while (step < 2 * volume_stack_size &&
scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
volume_stack_enter_exit(kg, state, stack_sd);
@@ -77,7 +74,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
#endif
}
ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState state)
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
{
PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
@@ -92,20 +89,14 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
volume_ray.t = FLT_MAX;
const uint visibility = (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_ALL_VISIBILITY);
int stack_index = 0, enclosed_index = 0;
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
const uint32_t visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, PATH_RAY_CAMERA);
/* Initialize volume stack with background volume. For shadow catcher the
* background volume is always assumed to be CG. */
/* Write background shader. */
if (kernel_data.background.volume_shader != SHADER_NONE) {
if (!(path_flag & PATH_RAY_SHADOW_CATCHER_PASS)) {
INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, stack_index, object) = OBJECT_NONE;
INTEGRATOR_STATE_ARRAY_WRITE(
state, volume_stack, stack_index, shader) = kernel_data.background.volume_shader;
stack_index++;
}
const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
integrator_state_write_volume_stack(state, stack_index, new_entry);
stack_index++;
}
/* Store to avoid global fetches on every intersection step. */
@@ -211,22 +202,9 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
/* Write terminator. */
const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
integrator_state_write_volume_stack(state, stack_index, new_entry);
}
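Note that the stack stores no explicit length; a {OBJECT_NONE, SHADER_NONE} entry terminates it, so readers scan until that sentinel. A sketch of the idea (the sentinel value -1 is illustrative):

```cpp
struct VolumeStack {
  int object;
  int shader;
};

enum { OBJECT_NONE = -1, SHADER_NONE = -1 }; /* illustrative sentinel values */

/* Depth of the stack = number of entries before the terminator. */
static int volume_stack_depth(const VolumeStack *stack)
{
  int n = 0;
  while (stack[n].shader != SHADER_NONE)
    n++;
  return n;
}
```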
ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorState state)
{
integrator_volume_stack_init(kg, state);
if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_PASS) {
/* Volume stack re-init for shadow catcher, continue with shading of hit. */
integrator_intersect_next_kernel_after_shadow_catcher_volume<
DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK>(kg, state);
}
else {
/* Volume stack init for camera rays, continue with intersection of camera ray. */
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
}
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
}
CCL_NAMESPACE_END


@@ -76,7 +76,7 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
if (queued_kernel) {
switch (queued_kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
integrator_intersect_closest(kg, state, render_buffer);
integrator_intersect_closest(kg, state);
break;
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
integrator_shade_background(kg, state, render_buffer);


@@ -70,9 +70,6 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
INTEGRATOR_STATE_WRITE(state, isect, object) = OBJECT_NONE;
INTEGRATOR_STATE_WRITE(state, isect, prim) = PRIM_NONE;
if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, 0, object) = OBJECT_NONE;
INTEGRATOR_STATE_ARRAY_WRITE(
@@ -125,7 +122,7 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,
/* volume scatter */
flag |= PATH_RAY_VOLUME_SCATTER;
flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
if (!(flag & PATH_RAY_ANY_PASS)) {
if (bounce == 1) {
flag |= PATH_RAY_VOLUME_PASS;
}
@@ -187,8 +184,8 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,
}
/* Render pass categories. */
if (!(flag & PATH_RAY_ANY_PASS) && !(flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
flag |= PATH_RAY_SURFACE_PASS;
if (bounce == 1) {
flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
}
}
@@ -211,7 +208,9 @@ ccl_device_inline bool path_state_volume_next(IntegratorState state)
}
/* Random number generator next bounce. */
INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
if (volume_bounds_bounce > 1) {
INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
}
return true;
}


@@ -175,7 +175,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
/* Write to render buffer. */
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
kernel_accum_emission(kg, state, throughput * light_eval, render_buffer);
kernel_accum_emission(kg, state, throughput, light_eval, render_buffer);
}
}
}
@@ -192,11 +192,23 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
#ifdef __SHADOW_CATCHER__
if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
/* Special case for shadow catcher where we want to fill the background pass
* behind the shadow catcher but also continue tracing the path. */
INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
integrator_intersect_next_kernel_after_shadow_catcher_background<
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND>(kg, state);
const int isect_prim = INTEGRATOR_STATE(state, isect, prim);
const int isect_type = INTEGRATOR_STATE(state, isect, type);
const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim, isect_type);
const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
if (shader_flags & SD_HAS_RAYTRACE) {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
shader);
}
else {
INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
shader);
}
return;
}
#endif


@@ -90,7 +90,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
/* Write to render buffer. */
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
kernel_accum_emission(kg, state, throughput * light_eval, render_buffer);
kernel_accum_emission(kg, state, throughput, light_eval, render_buffer);
}
ccl_device void integrator_shade_light(KernelGlobals kg,


@@ -101,7 +101,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
}
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
kernel_accum_emission(kg, state, throughput * L, render_buffer);
kernel_accum_emission(kg, state, throughput, L, render_buffer);
}
#endif /* __EMISSION__ */
@@ -191,18 +191,14 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
shadow_flag |= PATH_RAY_SURFACE_PASS;
shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
const float3 pass_diffuse_weight = (bounce == 0) ?
bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
INTEGRATOR_STATE(state, path, pass_diffuse_weight);
const float3 pass_glossy_weight = (bounce == 0) ?
bsdf_eval_pass_glossy_weight(&bsdf_eval) :
INTEGRATOR_STATE(state, path, pass_glossy_weight);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
const float3 diffuse_glossy_ratio = (bounce == 0) ?
bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
}
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
@@ -287,9 +283,7 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
if (INTEGRATOR_STATE(state, path, bounce) == 0) {
INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = bsdf_eval_pass_diffuse_weight(
&bsdf_eval);
INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = bsdf_eval_pass_glossy_weight(
INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(
&bsdf_eval);
}
}
@@ -451,7 +445,7 @@ ccl_device bool integrate_surface(KernelGlobals kg,
}
#endif
shader_prepare_surface_closures(kg, state, &sd, path_flag);
shader_prepare_surface_closures(kg, state, &sd);
#ifdef __HOLDOUT__
/* Evaluate holdout. */
@@ -498,6 +492,10 @@ ccl_device bool integrate_surface(KernelGlobals kg,
kernel_write_denoising_features_surface(kg, state, &sd, render_buffer);
#endif
#ifdef __SHADOW_CATCHER__
kernel_write_shadow_catcher_bounce_data(kg, state, &sd, render_buffer);
#endif
/* Direct light. */
PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
integrate_surface_direct_light(kg, state, &sd, &rng_state);


@@ -263,12 +263,6 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
/* Equi-angular sampling as in:
* "Importance Sampling Techniques for Path Tracing in Participating Media" */
/* Below this pdf we ignore samples, as they tend to lead to very long distances.
* This can cause performance issues with BVH traversal in OptiX, leading it to
* traverse many nodes. Since these contribute very little to the image, just ignore
* those samples. */
# define VOLUME_SAMPLE_PDF_CUTOFF 1e-8f
ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict ray,
const float3 light_P,
const float xi,
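For reference, the equiangular construction declared here follows the importance-sampling paper cited above (Kulla & Fajardo): warp a uniform xi through the angle subtended by the ray segment as seen from the light. A self-contained sketch under those assumptions:

```cpp
#include <cmath>

/* Sample a distance t in [t0, t1] along a ray whose closest approach to the
 * light is at parameter 'delta' with perpendicular distance D
 * (Kulla & Fajardo 2012). Pdfs below a tiny cutoff would be rejected by the
 * caller, mirroring VOLUME_SAMPLE_PDF_CUTOFF above. */
static float equiangular_sample(float delta, float D, float t0, float t1,
                                float xi, float *pdf)
{
  const float theta_a = atanf((t0 - delta) / D);
  const float theta_b = atanf((t1 - delta) / D);
  const float t = D * tanf(theta_a + xi * (theta_b - theta_a));
  *pdf = D / ((theta_b - theta_a) * (D * D + t * t));
  return delta + t;
}
```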
@@ -443,8 +437,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
/* Equiangular sampling for direct lighting. */
if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t &&
vstate.equiangular_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) {
const float new_dt = result.direct_t - vstate.start_t;
const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
@@ -481,28 +474,26 @@ ccl_device_forceinline void volume_integrate_step_scattering(
const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
if (vstate.distance_pdf * distance_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
/* throughput */
result.indirect_scatter = true;
result.indirect_t = new_t;
result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
shader_copy_volume_phases(&result.indirect_phases, sd);
/* throughput */
result.indirect_scatter = true;
result.indirect_t = new_t;
result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
shader_copy_volume_phases(&result.indirect_phases, sd);
if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
/* If using distance sampling for direct light, just copy parameters
* of indirect light since we scatter at the same point then. */
result.direct_scatter = true;
result.direct_t = result.indirect_t;
result.direct_throughput = result.indirect_throughput;
shader_copy_volume_phases(&result.direct_phases, sd);
if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
/* If using distance sampling for direct light, just copy parameters
* of indirect light since we scatter at the same point then. */
result.direct_scatter = true;
result.direct_t = result.indirect_t;
result.direct_throughput = result.indirect_throughput;
shader_copy_volume_phases(&result.direct_phases, sd);
/* Multiple importance sampling. */
if (vstate.use_mis) {
const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
equiangular_pdf);
result.direct_throughput *= 2.0f * mis_weight;
}
/* Multiple importance sampling. */
if (vstate.use_mis) {
const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
equiangular_pdf);
result.direct_throughput *= 2.0f * mis_weight;
}
}
}
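The MIS weight above is the standard power heuristic (beta = 2); the 2.0f factor presumably compensates for picking one of the two strategies (distance vs. equiangular) with probability 1/2 per sample. The heuristic itself, with weights for the two strategies summing to one:

```cpp
#include <cstdio>

/* Power heuristic with beta = 2 (Veach): weight for a strategy with pdf 'a'
 * against a competing strategy with pdf 'b'. */
static float power_heuristic(float a, float b)
{
  return (a * a) / (a * a + b * b);
}

int main()
{
  const float pdf_distance = 0.8f, pdf_equiangular = 0.2f;
  const float w_d = power_heuristic(pdf_distance, pdf_equiangular);
  const float w_e = power_heuristic(pdf_equiangular, pdf_distance);
  std::printf("%.3f + %.3f = %.3f\n", w_d, w_e, w_d + w_e); /* sums to 1 */
}
```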
@@ -617,7 +608,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
if (!result.indirect_scatter) {
const float3 emission = volume_emission_integrate(
&coeff, closure_flag, transmittance, dt);
accum_emission += result.indirect_throughput * emission;
accum_emission += emission;
}
}
@@ -670,7 +661,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
/* Write accumulated emission. */
if (!is_zero(accum_emission)) {
kernel_accum_emission(kg, state, accum_emission, render_buffer);
kernel_accum_emission(kg, state, result.indirect_throughput, accum_emission, render_buffer);
}
# ifdef __DENOISING_FEATURES__
@@ -703,10 +694,8 @@ ccl_device_forceinline bool integrate_volume_sample_light(
float light_u, light_v;
path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
if (!light_distribution_sample_from_volume_segment(
kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls)) {
return false;
}
light_distribution_sample_from_volume_segment(
kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls);
if (ls->shader & SHADER_EXCLUDE_SCATTER) {
return false;
@@ -805,11 +794,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
const float3 pass_diffuse_weight = (bounce == 0) ?
one_float3() :
INTEGRATOR_STATE(state, path, pass_diffuse_weight);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
const float3 diffuse_glossy_ratio = (bounce == 0) ?
one_float3() :
INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
}
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
@@ -888,8 +876,7 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput_phase;
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = one_float3();
}
/* Update path state */
@@ -1036,9 +1023,25 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
}
else {
/* Continue to background, light or surface. */
integrator_intersect_next_kernel_after_volume<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
kg, state, &isect, render_buffer);
return;
if (isect.prim == PRIM_NONE) {
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
return;
}
else if (isect.type & PRIMITIVE_LAMP) {
INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
return;
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
const int shader = intersection_get_shader(kg, &isect);
const int flags = kernel_tex_fetch(__shaders, shader).flags;
integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
kg, state, &isect, shader, flags);
return;
}
}
#endif /* __VOLUME__ */
}


@@ -105,42 +105,8 @@ ccl_device_inline void shader_copy_volume_phases(ccl_private ShaderVolumePhases
ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
ConstIntegratorState state,
ccl_private ShaderData *sd,
const uint32_t path_flag)
ccl_private ShaderData *sd)
{
/* Filter out closures. */
if (kernel_data.integrator.filter_closures) {
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
sd->closure_emission_background = zero_float3();
}
if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
sd->flag &= ~SD_BSDF_HAS_EVAL;
}
if (path_flag & PATH_RAY_CAMERA) {
for (int i = 0; i < sd->num_closure; i++) {
ccl_private ShaderClosure *sc = &sd->closure[i];
if ((CLOSURE_IS_BSDF_DIFFUSE(sc->type) &&
(kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE)) ||
(CLOSURE_IS_BSDF_GLOSSY(sc->type) &&
(kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY)) ||
(CLOSURE_IS_BSDF_TRANSMISSION(sc->type) &&
(kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION))) {
sc->type = CLOSURE_NONE_ID;
sc->sample_weight = 0.0f;
}
else if ((CLOSURE_IS_BSDF_TRANSPARENT(sc->type) &&
(kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSPARENT))) {
sc->type = CLOSURE_HOLDOUT_ID;
sc->sample_weight = 0.0f;
sd->flag |= SD_HOLDOUT;
}
}
}
}
/* Defensive sampling.
*
* We can likely also do defensive sampling at deeper bounces, particularly
@@ -243,7 +209,8 @@ ccl_device_inline float _shader_bsdf_multi_eval(KernelGlobals kg,
float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
if (bsdf_pdf != 0.0f) {
bsdf_eval_accum(result_eval, sc->type, eval * sc->weight);
const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
sum_pdf += bsdf_pdf * sc->sample_weight;
}
}
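The pattern in _shader_bsdf_multi_eval: each closure's value is accumulated with its color weight, while the combined pdf is the sample-weight-weighted average of the per-closure pdfs. Schematically, with scalar stand-ins for the float3 weights and a fixed per-closure pdf:

```cpp
struct Closure {
  float weight;        /* scalar stand-in for the float3 color weight */
  float sample_weight; /* probability weight used when picking closures */
};

static float multi_eval(const Closure *cs, int n, float *pdf_out)
{
  float eval = 0.0f, sum_pdf = 0.0f, sum_weights = 0.0f;
  for (int i = 0; i < n; i++) {
    const float pdf = 0.5f; /* stand-in for the per-closure bsdf_eval() pdf */
    eval += pdf * cs[i].weight;
    sum_pdf += pdf * cs[i].sample_weight;
    sum_weights += cs[i].sample_weight;
  }
  *pdf_out = (sum_weights > 0.0f) ? sum_pdf / sum_weights : 0.0f;
  return eval;
}
```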
@@ -268,7 +235,7 @@ ccl_device_inline
ccl_private BsdfEval *bsdf_eval,
const uint light_shader_flags)
{
bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_float3());
bsdf_eval_init(bsdf_eval, false, zero_float3());
return _shader_bsdf_multi_eval(
kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
@@ -361,7 +328,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals kg,
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f) {
bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
if (sd->num_closure > 1) {
const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
@@ -687,7 +655,7 @@ ccl_device_inline float _shader_volume_phase_multi_eval(
float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
if (phase_pdf != 0.0f) {
bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
bsdf_eval_accum(result_eval, false, eval, 1.0f);
sum_pdf += phase_pdf * svc->sample_weight;
}
@@ -703,7 +671,7 @@ ccl_device float shader_volume_phase_eval(KernelGlobals kg,
const float3 omega_in,
ccl_private BsdfEval *phase_eval)
{
bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_float3());
bsdf_eval_init(phase_eval, false, zero_float3());
return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
}
@@ -761,7 +729,7 @@ ccl_device int shader_volume_phase_sample(KernelGlobals kg,
label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f) {
bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
bsdf_eval_init(phase_eval, false, eval);
}
return label;
@@ -784,7 +752,7 @@ ccl_device int shader_phase_sample_closure(KernelGlobals kg,
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f)
bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
bsdf_eval_init(phase_eval, false, eval);
return label;
}


@@ -16,7 +16,6 @@
#pragma once
#include "kernel/film/write_passes.h"
#include "kernel/integrator/path_state.h"
#include "kernel/integrator/state_util.h"
@@ -48,7 +47,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
return false;
}
if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
return false;
}
@@ -77,6 +76,33 @@ ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
}
/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
* after this function. */
ccl_device_inline bool kernel_shadow_catcher_split(KernelGlobals kg,
IntegratorState state,
const int object_flags)
{
#ifdef __SHADOW_CATCHER__
if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
return false;
}
/* The split is to be done. Mark the current state as such, so that it stops contributing to the
* shadow catcher matte pass, but keeps contributing to the combined pass. */
INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
/* Split new state from the current one. This new state will only track contribution of shadow
* catcher objects ignoring non-catcher objects. */
integrator_state_shadow_catcher_split(kg, state);
return true;
#else
(void)object_flags;
return false;
#endif
}
#ifdef __SHADOW_CATCHER__
ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(const uint32_t path_flag)
@@ -89,28 +115,6 @@ ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(const uint32_t
return path_flag & PATH_RAY_SHADOW_CATCHER_PASS;
}
/* Write shadow catcher passes on a bounce from the shadow catcher object. */
ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
{
kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
kernel_data.film.pass_stride;
ccl_global float *buffer = render_buffer + render_buffer_offset;
/* Count sample for the shadow catcher object. */
kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
/* Since the split is done, the sample does not contribute to the matte, so accumulate it as
* transparency to the matte. */
const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
average(throughput));
}
#endif /* __SHADOW_CATCHER__ */
CCL_NAMESPACE_END
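One detail worth calling out in kernel_write_shadow_catcher_bounce_data above: a colored float3 throughput is folded into the single matte transparency channel with a plain channel average, along these lines:

```cpp
struct float3 {
  float x, y, z;
};

/* average() as used above: colored throughput -> one transparency scalar. */
static float average(const float3 &a)
{
  return (a.x + a.y + a.z) * (1.0f / 3.0f);
}
```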


@@ -46,9 +46,8 @@ KERNEL_STRUCT_MEMBER(shadow_path,
float3,
unshadowed_throughput,
KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
/* Ratio of throughput to distinguish diffuse and glossy render passes. */
KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
/* Number of intersections found by ray-tracing. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_path)


@@ -173,10 +173,10 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
/* Array access on GPU with Structure-of-Arrays. */
typedef int IntegratorState;
typedef int ConstIntegratorState;
typedef int IntegratorShadowState;
typedef int ConstIntegratorShadowState;
typedef const int IntegratorState;
typedef const int ConstIntegratorState;
typedef const int IntegratorShadowState;
typedef const int ConstIntegratorShadowState;
# define INTEGRATOR_STATE_NULL -1
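Context for this typedef change: on the GPU an IntegratorState is nothing but an index into structure-of-arrays storage, so const-qualifying the int presumably just documents that kernels never rebind it. A sketch of the state-as-index idea:

```cpp
#include <vector>

/* Structure-of-arrays state storage: one array per field. */
struct IntegratorStateSoA {
  std::vector<float> throughput_r, throughput_g, throughput_b;
};

using IntegratorState = int; /* a path's state is just its slot index */

static float throughput_r(const IntegratorStateSoA &soa, IntegratorState state)
{
  return soa.throughput_r[state]; /* field access compiles to an array index */
}
```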


@@ -60,9 +60,8 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
/* Throughput. */
KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
/* Ratio of throughput to distinguish diffuse and glossy render passes. */
KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
/* Denoising. */
KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
/* Shader sorting. */


@@ -326,8 +326,8 @@ ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
* after this function. */
ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGlobals kg,
IntegratorState state)
ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
IntegratorState state)
{
#if defined(__KERNEL_GPU__)
ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
@@ -337,14 +337,14 @@ ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGl
#else
IntegratorStateCPU *ccl_restrict to_state = state + 1;
/* Only copy the required subset for performance. */
/* Only copy the required subset, since shadow intersections are big and irrelevant here. */
to_state->path = state->path;
to_state->ray = state->ray;
to_state->isect = state->isect;
integrator_state_copy_volume_stack(kg, to_state, state);
#endif
return to_state;
INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
}
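On the GPU side of this function a new path slot is claimed with atomic_fetch_and_add on the live-path counter, and then only the needed fields are copied. A CPU sketch of that allocation step (the field set and pool layout are illustrative):

```cpp
#include <atomic>
#include <cstdint>
#include <vector>

struct StatePool {
  std::vector<float> throughput_r;     /* one array per state field (SoA) */
  std::atomic<uint32_t> next_index{0}; /* bump allocator for live paths */
};

/* Claim a fresh slot and copy the required subset from the parent path. */
static uint32_t shadow_catcher_split(StatePool &pool, uint32_t from)
{
  const uint32_t to = pool.next_index.fetch_add(1, std::memory_order_relaxed);
  pool.throughput_r[to] = pool.throughput_r[from]; /* copy only what is needed */
  return to;
}
```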
#ifdef __KERNEL_CPU__


@@ -71,10 +71,6 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
}
# endif
if (sd->flag & SD_BACKFACING) {
path_flag |= PATH_RAY_SUBSURFACE_BACKFACING;
}
INTEGRATOR_STATE_WRITE(state, path, throughput) *= weight;
INTEGRATOR_STATE_WRITE(state, path, flag) = path_flag;
@@ -83,8 +79,7 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
if (INTEGRATOR_STATE(state, path, bounce) == 0) {
INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
INTEGRATOR_STATE_WRITE(state, path, diffuse_glossy_ratio) = one_float3();
}
}


@@ -47,7 +47,6 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
const float time = INTEGRATOR_STATE(state, ray, time);
const float3 Ng = INTEGRATOR_STATE(state, subsurface, Ng);
const int object = INTEGRATOR_STATE(state, isect, object);
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
/* Read subsurface scattering parameters. */
const float3 radius = INTEGRATOR_STATE(state, subsurface, radius);
@@ -124,9 +123,6 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
const int object = ss_isect.hits[hit].object;
const int object_flag = kernel_tex_fetch(__object_flag, object);
float3 hit_Ng = ss_isect.Ng[hit];
if (path_flag & PATH_RAY_SUBSURFACE_BACKFACING) {
hit_Ng = -hit_Ng;
}
if (object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
hit_Ng = -hit_Ng;
}


@@ -73,7 +73,7 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
ls->P = zero_float3();
ls->Ng = zero_float3();
ls->D = zero_float3();
ls->pdf = 1.0f;
ls->pdf = true;
ls->t = FLT_MAX;
return true;
}
@@ -131,7 +131,7 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
ls->eval_fac *= spot_light_attenuation(
dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
if (!in_volume_segment && ls->eval_fac == 0.0f) {
if (ls->eval_fac == 0.0f) {
return false;
}
}
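For context, spot_light_attenuation() compares the cosine between the spot axis and the sample direction against the cone cosine, with a smooth blend near the edge. Roughly, assuming smoothstepf is the usual Hermite smoothstep (a sketch, not the exact kernel code):

```cpp
#include <algorithm>

static float smoothstepf(float t) /* Hermite smoothstep on [0, 1] */
{
  t = std::min(std::max(t, 0.0f), 1.0f);
  return t * t * (3.0f - 2.0f * t);
}

/* Cone falloff: zero outside the cone, smooth blend over 'spot_smooth'
 * at the edge, full strength inside. */
static float spot_attenuation(float cos_to_axis, float spot_angle, float spot_smooth)
{
  if (cos_to_axis <= spot_angle)
    return 0.0f;
  const float t = cos_to_axis - spot_angle;
  if (spot_smooth > 0.0f && t < spot_smooth)
    return cos_to_axis * smoothstepf(t / spot_smooth);
  return cos_to_axis;
}
```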
@@ -170,7 +170,7 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
float3 sample_axisu = axisu;
float3 sample_axisv = axisv;
if (!in_volume_segment && klight->area.tan_spread > 0.0f) {
if (klight->area.tan_spread > 0.0f) {
if (!light_spread_clamp_area_light(
P, Ng, &ls->P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
return false;
@@ -203,7 +203,7 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
ls->pdf *= kernel_data.integrator.pdf_lights;
return in_volume_segment || (ls->pdf > 0.0f);
return (ls->pdf > 0.0f);
}
ccl_device bool lights_intersect(KernelGlobals kg,
@@ -353,8 +353,8 @@ ccl_device bool light_sample_from_distant_ray(KernelGlobals kg,
/* compute pdf */
float invarea = klight->distant.invarea;
ls->pdf = invarea / (costheta * costheta * costheta);
ls->eval_fac = ls->pdf;
ls->pdf *= kernel_data.integrator.pdf_lights;
ls->eval_fac = ls->pdf;
return true;
}


@@ -199,9 +199,6 @@ ccl_device_inline float3 shadow_ray_offset(KernelGlobals kg,
if (offset_cutoff > 0.0f) {
float NgL = dot(Ng, L);
float offset_amount = 0.0f;
if (NL < 0) {
NL = -NL;
}
if (NL < offset_cutoff) {
offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
}


@@ -832,21 +832,16 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
{
if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) {
const float *data = (const float *)attr.value.data();
return set_attribute_float3(make_float3(data[0], data[1], data[2]), type, derivatives, val);
return set_attribute_float3(*(float3 *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == TypeFloat2) {
const float *data = (const float *)attr.value.data();
return set_attribute_float2(make_float2(data[0], data[1]), type, derivatives, val);
return set_attribute_float2(*(float2 *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == TypeDesc::TypeFloat) {
const float *data = (const float *)attr.value.data();
return set_attribute_float(data[0], type, derivatives, val);
return set_attribute_float(*(float *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == TypeRGBA || attr.type == TypeDesc::TypeFloat4) {
const float *data = (const float *)attr.value.data();
return set_attribute_float4(
make_float4(data[0], data[1], data[2], data[3]), type, derivatives, val);
return set_attribute_float4(*(float4 *)attr.value.data(), type, derivatives, val);
}
else if (attr.type == type) {
size_t datasize = attr.value.datasize();


@@ -132,12 +132,10 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
/* Used by render-services. */
sd->osl_globals = kg;
if (path_flag & PATH_RAY_SHADOW) {
sd->osl_path_state = nullptr;
sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
}
else {
sd->osl_path_state = (const IntegratorStateCPU *)state;
sd->osl_shadow_path_state = nullptr;
}
}


@@ -23,8 +23,7 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline bool svm_node_aov_check(const uint32_t path_flag,
ccl_global float *render_buffer)
{
bool is_primary = (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) &&
(!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
return ((render_buffer != NULL) && is_primary);
}


@@ -279,17 +279,17 @@ enum PathRayFlag {
PATH_RAY_SUBSURFACE_RANDOM_WALK = (1U << 20U),
PATH_RAY_SUBSURFACE_DISK = (1U << 21U),
PATH_RAY_SUBSURFACE_USE_FRESNEL = (1U << 22U),
PATH_RAY_SUBSURFACE_BACKFACING = (1U << 23U),
PATH_RAY_SUBSURFACE = (PATH_RAY_SUBSURFACE_RANDOM_WALK | PATH_RAY_SUBSURFACE_DISK |
PATH_RAY_SUBSURFACE_USE_FRESNEL | PATH_RAY_SUBSURFACE_BACKFACING),
PATH_RAY_SUBSURFACE_USE_FRESNEL),
/* Contribute to denoising features. */
PATH_RAY_DENOISING_FEATURES = (1U << 24U),
PATH_RAY_DENOISING_FEATURES = (1U << 23U),
/* Render pass categories. */
PATH_RAY_SURFACE_PASS = (1U << 25U),
PATH_RAY_REFLECT_PASS = (1U << 24U),
PATH_RAY_TRANSMISSION_PASS = (1U << 25U),
PATH_RAY_VOLUME_PASS = (1U << 26U),
PATH_RAY_ANY_PASS = (PATH_RAY_SURFACE_PASS | PATH_RAY_VOLUME_PASS),
PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
/* Shadow ray is for a light or surface, or AO. */
PATH_RAY_SHADOW_FOR_LIGHT = (1U << 27U),
@@ -428,20 +428,8 @@ typedef enum CryptomatteType {
typedef struct BsdfEval {
float3 diffuse;
float3 glossy;
float3 sum;
} BsdfEval;
/* Closure Filter */
typedef enum FilterClosures {
FILTER_CLOSURE_EMISSION = (1 << 0),
FILTER_CLOSURE_DIFFUSE = (1 << 1),
FILTER_CLOSURE_GLOSSY = (1 << 2),
FILTER_CLOSURE_TRANSMISSION = (1 << 3),
FILTER_CLOSURE_TRANSPARENT = (1 << 4),
FILTER_CLOSURE_DIRECT_LIGHT = (1 << 5),
} FilterClosures;
/* Shader Flag */
typedef enum ShaderFlag {
@@ -1198,11 +1186,7 @@ typedef struct KernelIntegrator {
int has_shadow_catcher;
float scrambling_distance;
/* Closure filter. */
int filter_closures;
/* padding */
int pad1, pad2, pad3;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
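The pad members exist so sizeof(KernelIntegrator) stays a multiple of 16, which static_assert_align then enforces for GPU-visible data. The same invariant expressed in portable C++, on a stand-in struct:

```cpp
struct KernelIntegratorLike {
  int has_shadow_catcher;
  float scrambling_distance;
  int filter_closures;
  int pad; /* explicit padding keeps the size a multiple of 16 bytes */
};

static_assert(sizeof(KernelIntegratorLike) % 16 == 0,
              "GPU-visible structs must keep 16-byte size alignment");
```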


@@ -187,6 +187,8 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_transmission_indirect = PASS_UNUSED;
kfilm->pass_volume_direct = PASS_UNUSED;
kfilm->pass_volume_indirect = PASS_UNUSED;
kfilm->pass_volume_direct = PASS_UNUSED;
kfilm->pass_volume_indirect = PASS_UNUSED;
kfilm->pass_shadow = PASS_UNUSED;
/* Mark passes as unused so that the kernel knows the pass is inaccessible. */
@@ -671,12 +673,13 @@ uint Film::get_kernel_features(const Scene *scene) const
kernel_features |= KERNEL_FEATURE_DENOISING;
}
if (pass_type >= PASS_DIFFUSE && pass_type <= PASS_VOLUME_INDIRECT) {
if (pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
pass_type <= PASS_CATEGORY_LIGHT_END) {
kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
}
if (pass_type == PASS_SHADOW) {
kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
if (pass_type == PASS_SHADOW) {
kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
}
}
if (pass_type == PASS_AO) {


@@ -14,13 +14,11 @@
* limitations under the License.
*/
#include "scene/integrator.h"
#include "device/device.h"
#include "scene/background.h"
#include "scene/bake.h"
#include "scene/camera.h"
#include "scene/film.h"
#include "scene/integrator.h"
#include "scene/jitter.h"
#include "scene/light.h"
#include "scene/object.h"
@@ -65,14 +63,6 @@ NODE_DEFINE(Integrator)
SOCKET_BOOLEAN(caustics_reflective, "Reflective Caustics", true);
SOCKET_BOOLEAN(caustics_refractive, "Refractive Caustics", true);
SOCKET_FLOAT(filter_glossy, "Filter Glossy", 0.0f);
SOCKET_BOOLEAN(use_direct_light, "Use Direct Light", true);
SOCKET_BOOLEAN(use_indirect_light, "Use Indirect Light", true);
SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true);
SOCKET_BOOLEAN(use_glossy, "Use Glossy", true);
SOCKET_BOOLEAN(use_transmission, "Use Transmission", true);
SOCKET_BOOLEAN(use_emission, "Use Emission", true);
SOCKET_INT(seed, "Seed", 0);
SOCKET_FLOAT(sample_clamp_direct, "Sample Clamp Direct", 0.0f);
SOCKET_FLOAT(sample_clamp_indirect, "Sample Clamp Indirect", 0.0f);
@@ -168,7 +158,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
kintegrator->ao_bounces = (ao_factor != 0.0f) ? ao_bounces : 0;
kintegrator->ao_bounces = ao_bounces;
kintegrator->ao_bounces_distance = ao_distance;
kintegrator->ao_bounces_factor = ao_factor;
kintegrator->ao_additive_factor = ao_additive_factor;
@@ -194,32 +184,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->caustics_refractive = caustics_refractive;
kintegrator->filter_glossy = (filter_glossy == 0.0f) ? FLT_MAX : 1.0f / filter_glossy;
kintegrator->filter_closures = 0;
if (!use_direct_light) {
kintegrator->filter_closures |= FILTER_CLOSURE_DIRECT_LIGHT;
}
if (!use_indirect_light) {
kintegrator->min_bounce = 1;
kintegrator->max_bounce = 1;
}
if (!use_diffuse) {
kintegrator->filter_closures |= FILTER_CLOSURE_DIFFUSE;
}
if (!use_glossy) {
kintegrator->filter_closures |= FILTER_CLOSURE_GLOSSY;
}
if (!use_transmission) {
kintegrator->filter_closures |= FILTER_CLOSURE_TRANSMISSION;
}
if (!use_emission) {
kintegrator->filter_closures |= FILTER_CLOSURE_EMISSION;
}
if (scene->bake_manager->get_baking()) {
/* Baking does not need to trace through transparency, we only want to bake
* the object itself. */
kintegrator->filter_closures |= FILTER_CLOSURE_TRANSPARENT;
}
kintegrator->seed = seed;
kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX :

Some files were not shown because too many files have changed in this diff.