GHOST: Device discovery API.

Still wip.
Merge branch 'temp-ghost-vulkan-backend' into tmp-vulkan
2021-11-23 15:29:11 +01:00 · 2021-11-23 14:01:01 +01:00 · 2021-11-23 13:23:50 +01:00 · 2021-11-23 13:02:25 +01:00 · 2021-11-23 13:02:00 +01:00 · 2021-11-23 12:39:26 +01:00
692 changed files with 36153 additions and 7850 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -411,6 +411,7 @@ option(WITH_CYCLES                   "Enable Cycles Render Engine" ON)
 option(WITH_CYCLES_OSL               "Build Cycles with OpenShadingLanguage support" ON)
 option(WITH_CYCLES_EMBREE            "Build Cycles with Embree support" ON)
 option(WITH_CYCLES_LOGGING           "Build Cycles with logging support" ON)
+option(WITH_CYCLES_DEBUG             "Build Cycles with options useful for debugging (e.g., MIS)" OFF)

 option(WITH_CYCLES_STANDALONE        "Build Cycles standalone application" OFF)
 option(WITH_CYCLES_STANDALONE_GUI    "Build Cycles standalone with GUI" OFF)
@@ -513,9 +514,13 @@ if(UNIX AND NOT APPLE)
 endif()


+# Vulkan
+option(WITH_VULKAN                    "Enable Vulkan backend (Experimental)" OFF)
+option(WITH_VULKAN_SHADER_COMPILATION "Temporary flag to enable vulkan shader compilation needed to continue development during the migration of GLSL to Vulkan." OFF)
+
 # OpenGL

-option(WITH_OPENGL              "When off limits visibility of the opengl headers to just bf_gpu and gawain (temporary option for development purposes)" ON)
+option(WITH_OPENGL              "When off limits visibility of the opengl headers to just bf_gpu (temporary option for development purposes)" ON)
 option(WITH_GLEW_ES             "Switches to experimental copy of GLEW that has support for OpenGL ES. (temporary option for development purposes)" OFF)
 option(WITH_GL_EGL              "Use the EGL OpenGL system library instead of the platform specific OpenGL system library (CGL, glX, or WGL)"       OFF)
 option(WITH_GL_PROFILE_ES20     "Support using OpenGL ES 2.0. (through either EGL or the AGL/WGL/XGL 'es20' profile)"                               OFF)
@@ -525,6 +530,7 @@ mark_as_advanced(
  WITH_GLEW_ES
  WITH_GL_EGL
  WITH_GL_PROFILE_ES20
+  WITH_VULKAN_SHADER_COMPILATION
 )

 if(WIN32)
@@ -1069,7 +1075,7 @@ if(MSVC)
  add_definitions(-D__LITTLE_ENDIAN__)

  # OSX-Note: as we do cross-compiling with specific set architecture,
-  # endianess-detection and auto-setting is counterproductive
+  # endianness-detection and auto-setting is counterproductive
  # so we just set endianness according CMAKE_OSX_ARCHITECTURES

 elseif(CMAKE_OSX_ARCHITECTURES MATCHES i386 OR CMAKE_OSX_ARCHITECTURES MATCHES x86_64 OR CMAKE_OSX_ARCHITECTURES MATCHES arm64)
@@ -1125,6 +1131,18 @@ if(WITH_OPENVDB)
  list(APPEND OPENVDB_LIBRARIES ${BOOST_LIBRARIES} ${TBB_LIBRARIES})
 endif()

+#-----------------------------------------------------------------------------
+# Configure Vulkan.
+
+if(WITH_VULKAN)
+  list(APPEND BLENDER_GL_LIBRARIES ${Vulkan_LIBRARY})
+
+  add_definitions(-DWITH_VULKAN)
+  if(WITH_VULKAN_SHADER_COMPILATION)
+    add_definitions(-DWITH_VULKAN_SHADER_COMPILATION)
+  endif()
+endif()
+
 #-----------------------------------------------------------------------------
 # Configure OpenGL.

@@ -1759,7 +1777,7 @@ endif()
 set(CMAKE_CXX_STANDARD 17)
 # If C++17 is not available, downgrading to an earlier standard is NOT OK.
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-# Do not enable compiler specific language extentions.
+# Do not enable compiler specific language extensions.
 set(CMAKE_CXX_EXTENSIONS OFF)

 # Make MSVC properly report the value of the __cplusplus preprocessor macro
--- a/2
+++ b/2
@@ -51,7 +51,7 @@ Other Convenience Targets
   * config:        Run cmake configuration tool to set build options.
   * deps:          Build library dependencies (intended only for platform maintainers).

-                    The existance of locally build dependancies overrides the pre-built dependencies from subversion.
+                    The existence of locally built dependencies overrides the pre-built dependencies from subversion.
                    These must be manually removed from '../lib/' to go back to using the pre-compiled libraries.

 Project Files
--- a/build_files/build_environment/cmake/harvest.cmake
+++ b/build_files/build_environment/cmake/harvest.cmake
@@ -17,7 +17,7 @@
 # ***** END GPL LICENSE BLOCK *****

 ########################################################################
-# Copy all generated files to the proper strucure as blender prefers
+# Copy all generated files to the proper structure as blender prefers
 ########################################################################

 if(NOT DEFINED HARVEST_TARGET)
--- a/build_files/build_environment/cmake/nanovdb.cmake
+++ b/build_files/build_environment/cmake/nanovdb.cmake
@@ -42,6 +42,7 @@ ExternalProject_Add(nanovdb
  URL_HASH ${NANOVDB_HASH_TYPE}=${NANOVDB_HASH}
  PREFIX ${BUILD_DIR}/nanovdb
  SOURCE_SUBDIR nanovdb
+  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/nanovdb/src/nanovdb < ${PATCH_DIR}/nanovdb.diff
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/nanovdb ${DEFAULT_CMAKE_FLAGS} ${NANOVDB_EXTRA_ARGS}
  INSTALL_DIR ${LIBDIR}/nanovdb
 )
--- a/build_files/build_environment/cmake/options.cmake
+++ b/build_files/build_environment/cmake/options.cmake
@@ -39,7 +39,7 @@ endif()
 set(DOWNLOAD_DIR "${CMAKE_CURRENT_BINARY_DIR}/downloads" CACHE STRING "Path for downloaded files")
 # This path must be hard-coded like this, so that the GNUmakefile knows where it is and can pass it to make_source_archive.py:
 set(PACKAGE_DIR "${CMAKE_CURRENT_BINARY_DIR}/packages")
-option(PACKAGE_USE_UPSTREAM_SOURCES "Use soures upstream to download the package sources, when OFF the blender mirror will be used" ON)
+option(PACKAGE_USE_UPSTREAM_SOURCES "Use sources upstream to download the package sources, when OFF the blender mirror will be used" ON)

 file(TO_CMAKE_PATH ${DOWNLOAD_DIR} DOWNLOAD_DIR)
 file(TO_CMAKE_PATH ${PACKAGE_DIR} PACKAGE_DIR)
--- a/build_files/build_environment/cmake/package_python.cmake
+++ b/build_files/build_environment/cmake/package_python.cmake
@@ -24,7 +24,7 @@ if(MSVC)
    add_custom_command(
      OUTPUT ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND echo packaging python
-      COMMAND echo this should ouput at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
+      COMMAND echo this should output at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PYTARGET}/libs
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}.lib ${PYTARGET}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}.lib
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/python.exe ${PYTARGET}/bin/python.exe
@@ -43,7 +43,7 @@ if(MSVC)
    add_custom_command(
      OUTPUT ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND echo packaging python
-      COMMAND echo this should ouput at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
+      COMMAND echo this should output at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PYTARGET}/libs
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}${PYTHON_POSTFIX}.lib ${PYTARGET}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}${PYTHON_POSTFIX}.lib
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/python${PYTHON_POSTFIX}.exe ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -1826,7 +1826,7 @@ compile_OCIO() {
    # Force linking against static libs
    #rm -f $_inst/lib/*.so*

-    # Additional depencencies
+    # Additional dependencies
    #cp ext/dist/lib/libtinyxml.a $_inst/lib
    #cp ext/dist/lib/libyaml-cpp.a $_inst/lib

--- a/build_files/build_environment/patches/nanovdb.diff
+++ b/build_files/build_environment/patches/nanovdb.diff
@@ -0,0 +1,374 @@
+Index: nanovdb/nanovdb/NanoVDB.h
+===================================================================
+--- a/nanovdb/nanovdb/NanoVDB.h	(revision 62751)
+++ b/nanovdb/nanovdb/NanoVDB.h	(working copy)
+@@ -152,8 +152,8 @@
+ 
+ #endif // __CUDACC_RTC__
+ 
+-#ifdef __CUDACC__
+-// Only define __hostdev__ when using NVIDIA CUDA compiler
+#if defined(__CUDACC__) || defined(__HIP__)
+// Only define __hostdev__ when using NVIDIA CUDA or HIP compiler
+ #define __hostdev__ __host__ __device__
+ #else
+ #define __hostdev__
+@@ -461,7 +461,7 @@
+ /// Maximum floating-point values
+ template<typename T>
+ struct Maximum;
+-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP__)
+ template<>
+ struct Maximum<int>
+ {
+@@ -1006,10 +1006,10 @@
+ using Vec3i = Vec3<int>;
+ 
+ /// @brief Return a single precision floating-point vector of this coordinate
+-Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); }
+inline __hostdev__ Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); }
+ 
+ /// @brief Return a double precision floating-point vector of this coordinate
+-Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); }
+inline __hostdev__ Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); }
+ 
+ // ----------------------------> Vec4 <--------------------------------------
+ 
+@@ -1820,7 +1820,7 @@
+ }; // Map
+ 
+ template<typename Mat4T>
+-void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper)
+__hostdev__ void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper)
+ {
+     float * mf = mMatF, *vf = mVecF;
+     float*  mif = mInvMatF;
+@@ -2170,7 +2170,7 @@
+ }; // Class Grid
+ 
+ template<typename TreeT>
+-int Grid<TreeT>::findBlindDataForSemantic(GridBlindDataSemantic semantic) const
+__hostdev__ int Grid<TreeT>::findBlindDataForSemantic(GridBlindDataSemantic semantic) const
+ {
+     for (uint32_t i = 0, n = blindDataCount(); i < n; ++i)
+         if (blindMetaData(i).mSemantic == semantic)
+@@ -2328,7 +2328,7 @@
+ }; // Tree class
+ 
+ template<typename RootT>
+-void Tree<RootT>::extrema(ValueType& min, ValueType& max) const
+__hostdev__ void Tree<RootT>::extrema(ValueType& min, ValueType& max) const
+ {
+     min = this->root().valueMin();
+     max = this->root().valueMax();
+@@ -2336,7 +2336,7 @@
+ 
+ template<typename RootT>
+ template<typename NodeT>
+-const NodeT* Tree<RootT>::getNode(uint32_t i) const
+__hostdev__ const NodeT* Tree<RootT>::getNode(uint32_t i) const
+ {
+     static_assert(is_same<TreeNodeT<NodeT::LEVEL>, NodeT>::value, "Tree::getNode: unvalid node type");
+     NANOVDB_ASSERT(i < DataType::mCount[NodeT::LEVEL]);
+@@ -2345,7 +2345,7 @@
+ 
+ template<typename RootT>
+ template<int LEVEL>
+-const typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i) const
+__hostdev__ const typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i) const
+ {
+     NANOVDB_ASSERT(i < DataType::mCount[LEVEL]);
+     return reinterpret_cast<const TreeNodeT<LEVEL>*>(reinterpret_cast<const uint8_t*>(this) + DataType::mBytes[LEVEL]) + i;
+@@ -2353,7 +2353,7 @@
+ 
+ template<typename RootT>
+ template<typename NodeT>
+-NodeT* Tree<RootT>::getNode(uint32_t i)
+__hostdev__ NodeT* Tree<RootT>::getNode(uint32_t i)
+ {
+     static_assert(is_same<TreeNodeT<NodeT::LEVEL>, NodeT>::value, "Tree::getNode: invalid node type");
+     NANOVDB_ASSERT(i < DataType::mCount[NodeT::LEVEL]);
+@@ -2362,7 +2362,7 @@
+ 
+ template<typename RootT>
+ template<int LEVEL>
+-typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i)
+__hostdev__ typename TreeNode<Tree<RootT>, LEVEL>::type* Tree<RootT>::getNode(uint32_t i)
+ {
+     NANOVDB_ASSERT(i < DataType::mCount[LEVEL]);
+     return reinterpret_cast<TreeNodeT<LEVEL>*>(reinterpret_cast<uint8_t*>(this) + DataType::mBytes[LEVEL]) + i;
+@@ -2370,7 +2370,7 @@
+ 
+ template<typename RootT>
+ template<typename NodeT>
+-uint32_t Tree<RootT>::getNodeID(const NodeT& node) const
+__hostdev__ uint32_t Tree<RootT>::getNodeID(const NodeT& node) const
+ {
+     static_assert(is_same<TreeNodeT<NodeT::LEVEL>, NodeT>::value, "Tree::getNodeID: invalid node type");
+     const NodeT* first = reinterpret_cast<const NodeT*>(reinterpret_cast<const uint8_t*>(this) + DataType::mBytes[NodeT::LEVEL]);
+@@ -2380,7 +2380,7 @@
+ 
+ template<typename RootT>
+ template<typename NodeT>
+-uint32_t Tree<RootT>::getLinearOffset(const NodeT& node) const
+__hostdev__ uint32_t Tree<RootT>::getLinearOffset(const NodeT& node) const
+ {
+     return this->getNodeID(node) + DataType::mPFSum[NodeT::LEVEL];
+ }
+@@ -3366,7 +3366,7 @@
+ }; // LeafNode class
+ 
+ template<typename ValueT, typename CoordT, template<uint32_t> class MaskT, uint32_t LOG2DIM>
+-inline void LeafNode<ValueT, CoordT, MaskT, LOG2DIM>::updateBBox()
+inline __hostdev__ void LeafNode<ValueT, CoordT, MaskT, LOG2DIM>::updateBBox()
+ {
+     static_assert(LOG2DIM == 3, "LeafNode::updateBBox: only supports LOGDIM = 3!");
+     if (!this->isActive()) return;
+Index: nanovdb/nanovdb/util/SampleFromVoxels.h
+===================================================================
+--- a/nanovdb/nanovdb/util/SampleFromVoxels.h	(revision 62751)
+++ b/nanovdb/nanovdb/util/SampleFromVoxels.h	(working copy)
+@@ -22,7 +22,7 @@
+ #define NANOVDB_SAMPLE_FROM_VOXELS_H_HAS_BEEN_INCLUDED
+ 
+ // Only define __hostdev__ when compiling as NVIDIA CUDA
+-#ifdef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIP__)
+ #define __hostdev__ __host__ __device__
+ #else
+ #include <cmath> // for floor
+@@ -136,7 +136,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const Vec3T& xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const Vec3T& xyz) const
+ {
+     const CoordT ijk = Round<CoordT>(xyz);
+     if (ijk != mPos) {
+@@ -147,7 +147,7 @@
+ }
+ 
+ template<typename TreeOrAccT>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const CoordT& ijk) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, true>::operator()(const CoordT& ijk) const
+ {
+     if (ijk != mPos) {
+         mPos = ijk;
+@@ -158,7 +158,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, false>::operator()(const Vec3T& xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 0, false>::operator()(const Vec3T& xyz) const
+ {
+     return mAcc.getValue(Round<CoordT>(xyz));
+ }
+@@ -195,7 +195,7 @@
+ }; // TrilinearSamplerBase
+ 
+ template<typename TreeOrAccT>
+-void TrilinearSampler<TreeOrAccT>::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const
+__hostdev__ void TrilinearSampler<TreeOrAccT>::stencil(CoordT& ijk, ValueT (&v)[2][2][2]) const
+ {
+     v[0][0][0] = mAcc.getValue(ijk); // i, j, k
+ 
+@@ -224,7 +224,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType TrilinearSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
+__hostdev__ typename TreeOrAccT::ValueType TrilinearSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
+ {
+ #if 0
+   auto lerp = [](ValueT a, ValueT b, ValueT w){ return fma(w, b-a, a); };// = w*(b-a) + a
+@@ -239,7 +239,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-Vec3T<typename TreeOrAccT::ValueType> TrilinearSampler<TreeOrAccT>::gradient(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
+__hostdev__ Vec3T<typename TreeOrAccT::ValueType> TrilinearSampler<TreeOrAccT>::gradient(const Vec3T<RealT> &uvw, const ValueT (&v)[2][2][2])
+ {
+     static_assert(std::is_floating_point<ValueT>::value, "TrilinearSampler::gradient requires a floating-point type");
+ #if 0
+@@ -270,7 +270,7 @@
+ }
+ 
+ template<typename TreeOrAccT>
+-bool TrilinearSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[2][2][2])
+__hostdev__ bool TrilinearSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[2][2][2])
+ {
+     static_assert(std::is_floating_point<ValueT>::value, "TrilinearSampler::zeroCrossing requires a floating-point type");
+     const bool less = v[0][0][0] < ValueT(0);
+@@ -363,7 +363,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(Vec3T<RealT> xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::sample(xyz, mVal);
+@@ -370,7 +370,7 @@
+ }
+ 
+ template<typename TreeOrAccT>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(const CoordT &ijk) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, true>::operator()(const CoordT &ijk) const
+ {
+     return  ijk == mPos ? mVal[0][0][0] : BaseT::mAcc.getValue(ijk);
+ }
+@@ -377,7 +377,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, true>::gradient(Vec3T<RealT> xyz) const
+__hostdev__ Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, true>::gradient(Vec3T<RealT> xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::gradient(xyz, mVal);
+@@ -393,7 +393,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-void SampleFromVoxels<TreeOrAccT, 1, true>::cache(Vec3T<RealT>& xyz) const
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 1, true>::cache(Vec3T<RealT>& xyz) const
+ {
+     CoordT ijk = Floor<CoordT>(xyz);
+     if (ijk != mPos) {
+@@ -406,7 +406,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
+ {
+     ValueT val[2][2][2];
+     CoordT ijk = Floor<CoordT>(xyz);
+@@ -418,7 +418,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 1, false>::operator()(Vec3T<RealT> xyz) const
+ {
+     auto lerp = [](ValueT a, ValueT b, RealT w) { return a + ValueT(w) * (b - a); };
+ 
+@@ -463,7 +463,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-inline Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, false>::gradient(Vec3T<RealT> xyz) const
+inline __hostdev__ Vec3T<typename TreeOrAccT::ValueType> SampleFromVoxels<TreeOrAccT, 1, false>::gradient(Vec3T<RealT> xyz) const
+ {
+     ValueT val[2][2][2];
+     CoordT ijk = Floor<CoordT>(xyz);
+@@ -473,7 +473,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-bool SampleFromVoxels<TreeOrAccT, 1, false>::zeroCrossing(Vec3T<RealT> xyz) const
+__hostdev__ bool SampleFromVoxels<TreeOrAccT, 1, false>::zeroCrossing(Vec3T<RealT> xyz) const
+ {
+     ValueT val[2][2][2];
+     CoordT ijk = Floor<CoordT>(xyz);
+@@ -510,7 +510,7 @@
+ }; // TriquadraticSamplerBase
+ 
+ template<typename TreeOrAccT>
+-void TriquadraticSampler<TreeOrAccT>::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
+__hostdev__ void TriquadraticSampler<TreeOrAccT>::stencil(const CoordT &ijk, ValueT (&v)[3][3][3]) const
+ {
+     CoordT p(ijk[0] - 1, 0, 0);
+     for (int dx = 0; dx < 3; ++dx, ++p[0]) {
+@@ -526,7 +526,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType TriquadraticSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[3][3][3])
+__hostdev__ typename TreeOrAccT::ValueType TriquadraticSampler<TreeOrAccT>::sample(const Vec3T<RealT> &uvw, const ValueT (&v)[3][3][3])
+ {
+     auto kernel = [](const ValueT* value, double weight)->ValueT {
+         return weight * (weight * (0.5f * (value[0] + value[2]) - value[1]) + 
+@@ -545,7 +545,7 @@
+ }
+ 
+ template<typename TreeOrAccT>
+-bool TriquadraticSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[3][3][3])
+__hostdev__ bool TriquadraticSampler<TreeOrAccT>::zeroCrossing(const ValueT (&v)[3][3][3])
+ {
+     static_assert(std::is_floating_point<ValueT>::value, "TrilinearSampler::zeroCrossing requires a floating-point type");
+     const bool less = v[0][0][0] < ValueT(0);
+@@ -624,7 +624,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(Vec3T<RealT> xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::sample(xyz, mVal);
+@@ -631,7 +631,7 @@
+ }
+ 
+ template<typename TreeOrAccT>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(const CoordT &ijk) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, true>::operator()(const CoordT &ijk) const
+ {
+     return  ijk == mPos ? mVal[1][1][1] : BaseT::mAcc.getValue(ijk);
+ }
+@@ -646,7 +646,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-void SampleFromVoxels<TreeOrAccT, 2, true>::cache(Vec3T<RealT>& xyz) const
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 2, true>::cache(Vec3T<RealT>& xyz) const
+ {
+     CoordT ijk = Floor<CoordT>(xyz);
+     if (ijk != mPos) {
+@@ -657,7 +657,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, false>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 2, false>::operator()(Vec3T<RealT> xyz) const
+ {
+     ValueT val[3][3][3];
+     CoordT ijk = Floor<CoordT>(xyz);
+@@ -667,7 +667,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-bool SampleFromVoxels<TreeOrAccT, 2, false>::zeroCrossing(Vec3T<RealT> xyz) const
+__hostdev__ bool SampleFromVoxels<TreeOrAccT, 2, false>::zeroCrossing(Vec3T<RealT> xyz) const
+ {
+     ValueT val[3][3][3];
+     CoordT ijk = Floor<CoordT>(xyz);
+@@ -710,7 +710,7 @@
+ }; // TricubicSampler
+ 
+ template<typename TreeOrAccT>
+-void TricubicSampler<TreeOrAccT>::stencil(const CoordT& ijk, ValueT (&C)[64]) const
+__hostdev__ void TricubicSampler<TreeOrAccT>::stencil(const CoordT& ijk, ValueT (&C)[64]) const
+ {
+     auto fetch = [&](int i, int j, int k) -> ValueT& { return C[((i + 1) << 4) + ((j + 1) << 2) + k + 1]; };
+ 
+@@ -929,7 +929,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
+__hostdev__ typename TreeOrAccT::ValueType SampleFromVoxels<TreeOrAccT, 3, true>::operator()(Vec3T<RealT> xyz) const
+ {
+     this->cache(xyz);
+     return BaseT::sample(xyz, mC);
+@@ -937,7 +937,7 @@
+ 
+ template<typename TreeOrAccT>
+ template<typename RealT, template<typename...> class Vec3T>
+-void SampleFromVoxels<TreeOrAccT, 3, true>::cache(Vec3T<RealT>& xyz) const
+__hostdev__ void SampleFromVoxels<TreeOrAccT, 3, true>::cache(Vec3T<RealT>& xyz) const
+ {
+     CoordT ijk = Floor<CoordT>(xyz);
+     if (ijk != mPos) {
--- a/build_files/cmake/Modules/FindShaderC.cmake
+++ b/build_files/cmake/Modules/FindShaderC.cmake
@@ -0,0 +1,66 @@
+# - Find SHADERC library
+# Find the native ShaderC includes and library
+# This module defines
+#  SHADERC_INCLUDE_DIRS, where to find shaderc.hpp, set when
+#                        SHADERC_INCLUDE_DIR is found.
+#  SHADERC_LIBRARIES, libraries to link against to use ShaderC.
+#  SHADERC_ROOT_DIR, The base directory to search for ShaderC.
+#                    This can also be an environment variable.
+#  SHADERC_FOUND, If false, do not try to use ShaderC.
+#
+# also defined, but not for general use are
+#  SHADERC_LIBRARY, where to find the ShaderC library.
+
+#=============================================================================
+# Copyright 2021 Blender Foundation.
+#
+# Distributed under the OSI-approved BSD 3-Clause License,
+# see accompanying file BSD-3-Clause-license.txt for details.
+#=============================================================================
+
+# If SHADERC_ROOT_DIR was not set by the caller, fall back to the environment variable of the same name.
+if(NOT SHADERC_ROOT_DIR AND NOT "$ENV{SHADERC_ROOT_DIR}" STREQUAL "")
+  set(SHADERC_ROOT_DIR $ENV{SHADERC_ROOT_DIR})
+endif()
+
+set(_shaderc_SEARCH_DIRS
+  ${SHADERC_ROOT_DIR}
+  /opt/lib/shaderc
+)
+
+find_path(SHADERC_INCLUDE_DIR
+  NAMES
+    shaderc.hpp
+  HINTS
+    ${_shaderc_SEARCH_DIRS}
+  PATH_SUFFIXES
+    include/shaderc
+    include
+)
+
+find_library(SHADERC_LIBRARY
+  NAMES
+    shaderc_combined
+    shaderc
+  HINTS
+    ${_shaderc_SEARCH_DIRS}
+  PATH_SUFFIXES
+    lib64 lib
+)
+
+# Handle the QUIETLY and REQUIRED arguments and set SHADERC_FOUND to TRUE if
+# both SHADERC_LIBRARY and SHADERC_INCLUDE_DIR were located.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ShaderC DEFAULT_MSG SHADERC_LIBRARY SHADERC_INCLUDE_DIR)
+
+if(SHADERC_FOUND)
+  set(SHADERC_LIBRARIES ${SHADERC_LIBRARY})
+  set(SHADERC_INCLUDE_DIRS ${SHADERC_INCLUDE_DIR})
+endif()
+
+mark_as_advanced(
+  SHADERC_INCLUDE_DIR
+  SHADERC_LIBRARY
+)
+
+unset(_shaderc_SEARCH_DIRS)
--- a/build_files/cmake/cmake_netbeans_project.py
+++ b/build_files/cmake/cmake_netbeans_project.py
@@ -180,7 +180,7 @@ def create_nb_project_main():
        f.write('    </logicalFolder>\n')

        f.write('  </logicalFolder>\n')
-        # default, but this dir is infact not in blender dir so we can ignore it
+        # default, but this dir is in fact not in blender dir so we can ignore it
        # f.write('  <sourceFolderFilter>^(nbproject)$</sourceFolderFilter>\n')
        f.write(r'  <sourceFolderFilter>^(nbproject|__pycache__|.*\.py|.*\.html|.*\.blend)$</sourceFolderFilter>\n')

--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -529,7 +529,7 @@ function(SETUP_LIBDIRS)

  # NOTE: For all new libraries, use absolute library paths.
  # This should eventually be phased out.
-  # APPLE plaform uses full paths for linking libraries, and avoids link_directories.
+  # APPLE platform uses full paths for linking libraries, and avoids link_directories.
  if(NOT MSVC AND NOT APPLE)
    link_directories(${JPEG_LIBPATH} ${PNG_LIBPATH} ${ZLIB_LIBPATH} ${FREETYPE_LIBPATH})

--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -102,6 +102,11 @@ find_package_wrapper(ZLIB REQUIRED)
 find_package_wrapper(Zstd REQUIRED)
 find_package_wrapper(Freetype REQUIRED)

+if(WITH_VULKAN)
+  find_package_wrapper(Vulkan REQUIRED)
+  find_package(ShaderC REQUIRED)
+endif()
+
 if(WITH_PYTHON)
  # No way to set py35, remove for now.
  # find_package(PythonLibs)
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -874,5 +874,32 @@ if(WITH_HARU)
  endif()
 endif()

+if(WITH_VULKAN)
+  if(EXISTS ${LIBDIR}/vulkan)
+    set(Vulkan_FOUND On)
+    set(Vulkan_ROOT_DIR ${LIBDIR}/vulkan)
+    set(Vulkan_INCLUDE_DIR ${Vulkan_ROOT_DIR}/include)
+    set(Vulkan_INCLUDE_DIRS ${Vulkan_INCLUDE_DIR})
+    set(Vulkan_LIBRARY ${Vulkan_ROOT_DIR}/lib/vulkan-1.lib)
+    set(Vulkan_LIBRARIES ${Vulkan_LIBRARY})
+  else()
+    message(WARNING "vulkan was not found, disabling WITH_VULKAN")
+    set(WITH_VULKAN OFF)
+  endif()
+endif()
+
+if(WITH_VULKAN)
+  if(EXISTS ${LIBDIR}/shaderc)
+    set(SHADERC_ROOT_DIR ${LIBDIR}/shaderc)
+    set(SHADERC_INCLUDE_DIR ${SHADERC_ROOT_DIR}/include)
+    set(SHADERC_INCLUDE_DIRS ${SHADERC_INCLUDE_DIR})
+    set(SHADERC_LIBRARY optimized ${SHADERC_ROOT_DIR}/lib/shaderc_shared.lib debug ${SHADERC_ROOT_DIR}/lib/shaderc_shared_d.lib)
+    set(SHADERC_LIBRARIES ${SHADERC_LIBRARY})
+  else()
+    message(WARNING "shaderc was not found, disabling WITH_VULKAN")
+    set(WITH_VULKAN OFF)
+  endif()
+endif()
+
 set(ZSTD_INCLUDE_DIRS ${LIBDIR}/zstd/include)
 set(ZSTD_LIBRARIES ${LIBDIR}/zstd/lib/zstd_static.lib)
--- a/build_files/cmake/platform/platform_win32_bundle_crt.cmake
+++ b/build_files/cmake/platform/platform_win32_bundle_crt.cmake
@@ -27,7 +27,7 @@ if(WITH_WINDOWS_BUNDLE_CRT)
  # Install the CRT to the blender.crt Sub folder.
  install(FILES ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS} DESTINATION ./blender.crt COMPONENT Libraries)

-  # Generating the manifest is a relativly expensive operation since
+  # Generating the manifest is a relatively expensive operation since
  # it is collecting an sha1 hash for every file required. so only do
  # this work when the libs have either changed or the manifest does
  # not exist yet.
--- a/doc/python_api/examples/bpy.app.timers.5.py
+++ b/doc/python_api/examples/bpy.app.timers.5.py
@@ -11,7 +11,7 @@ import queue

 execution_queue = queue.Queue()

-# This function can savely be called in another thread.
+# This function can safely be called in another thread.
 # The function will be executed when the timer runs the next time.
 def run_in_main_thread(function):
    execution_queue.put(function)
--- a/doc/python_api/examples/bpy.types.Operator.1.py
+++ b/doc/python_api/examples/bpy.types.Operator.1.py
@@ -42,8 +42,13 @@ class SimpleMouseOperator(bpy.types.Operator):
        self.y = event.mouse_y
        return self.execute(context)

+# Only needed if you want to add into a dynamic menu
+def menu_func(self, context):
+    self.layout.operator(SimpleMouseOperator.bl_idname, text="Simple Mouse Operator")

+# Register and add to the view menu (required to also use F3 search "Simple Mouse Operator" for quick access)
 bpy.utils.register_class(SimpleMouseOperator)
+bpy.types.VIEW3D_MT_view.append(menu_func)

 # Test call to the newly defined operator.
 # Here we call the operator and invoke it, meaning that the settings are taken
--- a/doc/python_api/examples/bpy.types.Operator.2.py
+++ b/doc/python_api/examples/bpy.types.Operator.2.py
@@ -43,7 +43,7 @@ def menu_func(self, context):
    self.layout.operator(ExportSomeData.bl_idname, text="Text Export Operator")


-# Register and add to the file selector
+# Register and add to the file selector (required to also use F3 search "Text Export Operator" for quick access)
 bpy.utils.register_class(ExportSomeData)
 bpy.types.TOPBAR_MT_file_export.append(menu_func)

--- a/doc/python_api/examples/bpy.types.Operator.3.py
+++ b/doc/python_api/examples/bpy.types.Operator.3.py
@@ -27,8 +27,14 @@ class DialogOperator(bpy.types.Operator):
        wm = context.window_manager
        return wm.invoke_props_dialog(self)

+# Only needed if you want to add into a dynamic menu
+def menu_func(self, context):
+    self.layout.operator(DialogOperator.bl_idname, text="Dialog Operator")

+
+# Register and add to the object menu (required to also use F3 search "Dialog Operator" for quick access)
 bpy.utils.register_class(DialogOperator)
+bpy.types.VIEW3D_MT_object.append(menu_func)

 # Test call.
 bpy.ops.object.dialog_operator('INVOKE_DEFAULT')
--- a/doc/python_api/examples/bpy.types.Operator.4.py
+++ b/doc/python_api/examples/bpy.types.Operator.4.py
@@ -41,8 +41,13 @@ class CustomDrawOperator(bpy.types.Operator):

        col.prop(self, "my_string")

+# Only needed if you want to add into a dynamic menu
+def menu_func(self, context):
+    self.layout.operator(CustomDrawOperator.bl_idname, text="Custom Draw Operator")

+# Register and add to the object menu (required to also use F3 search "Custom Draw Operator" for quick access)
 bpy.utils.register_class(CustomDrawOperator)
+bpy.types.VIEW3D_MT_object.append(menu_func)

 # test call
 bpy.ops.object.custom_draw('INVOKE_DEFAULT')
--- a/doc/python_api/examples/bpy.types.Operator.5.py
+++ b/doc/python_api/examples/bpy.types.Operator.5.py
@@ -55,8 +55,13 @@ class ModalOperator(bpy.types.Operator):
        context.window_manager.modal_handler_add(self)
        return {'RUNNING_MODAL'}

+# Only needed if you want to add into a dynamic menu
+def menu_func(self, context):
+    self.layout.operator(ModalOperator.bl_idname, text="Modal Operator")

+# Register and add to the object menu (required to also use F3 search "Modal Operator" for quick access)
 bpy.utils.register_class(ModalOperator)
+bpy.types.VIEW3D_MT_object.append(menu_func)

 # test call
 bpy.ops.object.modal_operator('INVOKE_DEFAULT')
--- a/doc/python_api/examples/bpy.types.Operator.6.py
+++ b/doc/python_api/examples/bpy.types.Operator.6.py
@@ -31,8 +31,13 @@ class SearchEnumOperator(bpy.types.Operator):
        context.window_manager.invoke_search_popup(self)
        return {'RUNNING_MODAL'}

+# Only needed if you want to add into a dynamic menu
+def menu_func(self, context):
+    self.layout.operator(SearchEnumOperator.bl_idname, text="Search Enum Operator")

+# Register and add to the object menu (required to also use F3 search "Search Enum Operator" for quick access)
 bpy.utils.register_class(SearchEnumOperator)
+bpy.types.VIEW3D_MT_object.append(menu_func)

 # test call
 bpy.ops.object.search_enum_operator('INVOKE_DEFAULT')
--- a/doc/python_api/examples/bpy.types.Operator.py
+++ b/doc/python_api/examples/bpy.types.Operator.py
@@ -22,8 +22,13 @@ class HelloWorldOperator(bpy.types.Operator):
        print("Hello World")
        return {'FINISHED'}

+# Only needed if you want to add into a dynamic menu
+def menu_func(self, context):
+    self.layout.operator(HelloWorldOperator.bl_idname, text="Hello World Operator")

+# Register and add to the view menu (required to also use F3 search "Hello World Operator" for quick access)
 bpy.utils.register_class(HelloWorldOperator)
+bpy.types.VIEW3D_MT_view.append(menu_func)

 # test call to the newly defined operator
 bpy.ops.wm.hello_world()
--- a/doc/python_api/rst/info_gotcha.rst
+++ b/doc/python_api/rst/info_gotcha.rst
@@ -728,7 +728,7 @@ Abusing RNA property callbacks
 ------------------------------

 Python-defined RNA properties can have custom callbacks. Trying to perform complex operations
-from there, like calling an operator, may work, but is not officialy recommended nor supported.
+from there, like calling an operator, may work, but is not officially recommended nor supported.

 Main reason is that those callback should be very fast, but additionally, it may for example
 create issues with undo/redo system (most operators store an history step, and editing an RNA
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -116,3 +116,7 @@ endif()
 if (WITH_COMPOSITOR)
  add_subdirectory(smaa_areatex)
 endif()
+
+if(WITH_VULKAN)
+  add_subdirectory(vulkan_memory_allocator)
+endif()
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
@@ -804,31 +804,29 @@ typedef enum hipDeviceP2PAttr {
 } hipDeviceP2PAttr;

 typedef struct HIP_MEMCPY3D {
-  size_t srcXInBytes;
-  size_t srcY;
-  size_t srcZ;
-  size_t srcLOD;
+  unsigned int srcXInBytes;
+  unsigned int srcY;
+  unsigned int srcZ;
+  unsigned int srcLOD;
  hipMemoryType srcMemoryType;
  const void* srcHost;
  hipDeviceptr_t srcDevice;
-  hArray * srcArray;
-  void* reserved0;
-  size_t srcPitch;
-  size_t srcHeight;
-  size_t dstXInBytes;
-  size_t dstY;
-  size_t dstZ;
-  size_t dstLOD;
+  hArray srcArray;
+  unsigned int srcPitch;
+  unsigned int srcHeight;
+  unsigned int dstXInBytes;
+  unsigned int dstY;
+  unsigned int dstZ;
+  unsigned int dstLOD;
  hipMemoryType dstMemoryType;
  void* dstHost;
  hipDeviceptr_t dstDevice;
-  hArray * dstArray;
-  void* reserved1;
-  size_t dstPitch;
-  size_t dstHeight;
-  size_t WidthInBytes;
-  size_t Height;
-  size_t Depth;
+  hArray dstArray;
+  unsigned int dstPitch;
+  unsigned int dstHeight;
+  unsigned int WidthInBytes;
+  unsigned int Height;
+  unsigned int Depth;
 } HIP_MEMCPY3D;

 typedef struct HIP_MEMCPY3D_PEER_st {
@@ -879,7 +877,7 @@ typedef struct HIP_RESOURCE_DESC_st {
  hipResourceType resType;
  union {
    struct {
-      hArray * h_Array;
+      hArray h_Array;
    } array;
    struct {
      hipMipmappedArray_t hMipmappedArray;
@@ -1074,9 +1072,10 @@ typedef enum hiprtcResult {
 typedef hipError_t HIPAPI thipGetErrorName(hipError_t error, const char** pStr);
 typedef hipError_t HIPAPI thipInit(unsigned int Flags);
 typedef hipError_t HIPAPI thipDriverGetVersion(int* driverVersion);
-typedef hipError_t HIPAPI thipGetDevice(hipDevice_t* device, int ordinal);
+typedef hipError_t HIPAPI thipGetDevice(int* device);
 typedef hipError_t HIPAPI thipGetDeviceCount(int* count);
 typedef hipError_t HIPAPI thipGetDeviceProperties(hipDeviceProp_t* props, int deviceId);
+typedef hipError_t HIPAPI thipDeviceGet(hipDevice_t* device, int ordinal);
 typedef hipError_t HIPAPI thipDeviceGetName(char* name, int len, hipDevice_t dev);
 typedef hipError_t HIPAPI thipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
 typedef hipError_t HIPAPI thipDeviceComputeCapability(int* major, int* minor, hipDevice_t dev);
@@ -1209,6 +1208,7 @@ extern thipDriverGetVersion *hipDriverGetVersion;
 extern thipGetDevice *hipGetDevice;
 extern thipGetDeviceCount *hipGetDeviceCount;
 extern thipGetDeviceProperties *hipGetDeviceProperties;
+extern thipDeviceGet* hipDeviceGet;
 extern thipDeviceGetName *hipDeviceGetName;
 extern thipDeviceGetAttribute *hipDeviceGetAttribute;
 extern thipDeviceComputeCapability *hipDeviceComputeCapability;
--- a/extern/hipew/src/hipew.c
+++ b/extern/hipew/src/hipew.c
@@ -71,6 +71,7 @@ thipDriverGetVersion *hipDriverGetVersion;
 thipGetDevice *hipGetDevice;
 thipGetDeviceCount *hipGetDeviceCount;
 thipGetDeviceProperties *hipGetDeviceProperties;
+thipDeviceGet* hipDeviceGet;
 thipDeviceGetName *hipDeviceGetName;
 thipDeviceGetAttribute *hipDeviceGetAttribute;
 thipDeviceComputeCapability *hipDeviceComputeCapability;
@@ -255,6 +256,7 @@ static int hipewHipInit(void) {
  HIP_LIBRARY_FIND_CHECKED(hipGetDevice);
  HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount);
  HIP_LIBRARY_FIND_CHECKED(hipGetDeviceProperties);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceGet);
  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName);
  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute);
  HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability);
--- a/extern/vulkan_memory_allocator/CMakeLists.txt
+++ b/extern/vulkan_memory_allocator/CMakeLists.txt
@@ -0,0 +1,42 @@
+# ***** BEGIN GPL LICENSE BLOCK *****
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# The Original Code is Copyright (C) 2012, Blender Foundation
+# All rights reserved.
+# ***** END GPL LICENSE BLOCK *****
+
+
+set(INC
+  .
+)
+
+set(INC_SYS
+  ${Vulkan_INCLUDE_DIRS}
+)
+
+set(SRC
+  vk_mem_alloc_impl.cc
+
+  vk_mem_alloc.h
+)
+
+blender_add_lib(extern_vulkan_memory_allocator "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
+
+# `-Wno-nullability-completeness` only exists in Clang, and this library is
+# built from C++ sources, so test the C++ compiler ID rather than GCC/C.
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  target_compile_options(extern_vulkan_memory_allocator PRIVATE "-Wno-nullability-completeness")
+endif()
--- a/extern/vulkan_memory_allocator/LICENSE.txt
+++ b/extern/vulkan_memory_allocator/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/extern/vulkan_memory_allocator/README.blender
+++ b/extern/vulkan_memory_allocator/README.blender
@@ -0,0 +1,5 @@
+Project: VulkanMemoryAllocator
+URL: https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator
+License: MIT
+Upstream version: 4b047fd
+Local modifications: None
--- a/extern/vulkan_memory_allocator/README.md
+++ b/extern/vulkan_memory_allocator/README.md
@@ -0,0 +1,134 @@
+# Vulkan Memory Allocator
+
+Easy to integrate Vulkan memory allocation library.
+
+**Documentation:** See [Vulkan Memory Allocator](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/) (generated from Doxygen-style comments in [src/vk_mem_alloc.h](src/vk_mem_alloc.h))
+
+**License:** MIT. See [LICENSE.txt](LICENSE.txt)
+
+**Changelog:** See [CHANGELOG.md](CHANGELOG.md)
+
+**Product page:** [Vulkan Memory Allocator on GPUOpen](https://gpuopen.com/gaming-product/vulkan-memory-allocator/)
+
+**Build status:**
+
+- Windows: [![Build status](https://ci.appveyor.com/api/projects/status/4vlcrb0emkaio2pn/branch/master?svg=true)](https://ci.appveyor.com/project/adam-sawicki-amd/vulkanmemoryallocator/branch/master)  
+- Linux: [![Build Status](https://travis-ci.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.svg?branch=master)](https://travis-ci.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator)
+
+# Problem
+
+Memory allocation and resource (buffer and image) creation in Vulkan is difficult (comparing to older graphics API-s, like D3D11 or OpenGL) for several reasons:
+
+- It requires a lot of boilerplate code, just like everything else in Vulkan, because it is a low-level and high-performance API.
+- There is additional level of indirection: `VkDeviceMemory` is allocated separately from creating `VkBuffer`/`VkImage` and they must be bound together.
+- Driver must be queried for supported memory heaps and memory types. Different IHVs provide different types of it.
+- It is recommended practice to allocate bigger chunks of memory and assign parts of them to particular resources.
+
+# Features
+
+This library can help game developers to manage memory allocations and resource creation by offering some higher-level functions:
+
+1. Functions that help to choose correct and optimal memory type based on intended usage of the memory.
+   - Required or preferred traits of the memory are expressed using higher-level description comparing to Vulkan flags.
+2. Functions that allocate memory blocks, reserve and return parts of them (`VkDeviceMemory` + offset + size) to the user.
+   - Library keeps track of allocated memory blocks, used and unused ranges inside them, finds best matching unused ranges for new allocations, respects all the rules of alignment and buffer/image granularity.
+3. Functions that can create an image/buffer, allocate memory for it and bind them together - all in one call.
+
+Additional features:
+
+- Well-documented - description of all functions and structures provided, along with chapters that contain general description and example code.
+- Thread-safety: Library is designed to be used in multithreaded code. Access to a single device memory block referred by different buffers and textures (binding, mapping) is synchronized internally.
+- Configuration: Fill optional members of CreateInfo structure to provide custom CPU memory allocator, pointers to Vulkan functions and other parameters.
+- Customization: Predefine appropriate macros to provide your own implementation of all external facilities used by the library, from assert, mutex, and atomic, to vector and linked list. 
+- Support for memory mapping, reference-counted internally. Support for persistently mapped memory: Just allocate with appropriate flag and you get access to mapped pointer.
+- Support for non-coherent memory. Functions that flush/invalidate memory. `nonCoherentAtomSize` is respected automatically.
+- Support for resource aliasing (overlap).
+- Support for sparse binding and sparse residency: Convenience functions that allocate or free multiple memory pages at once.
+- Custom memory pools: Create a pool with desired parameters (e.g. fixed or limited maximum size) and allocate memory out of it.
+- Linear allocator: Create a pool with linear algorithm and use it for much faster allocations and deallocations in free-at-once, stack, double stack, or ring buffer fashion.
+- Support for Vulkan 1.0, 1.1, 1.2.
+- Support for extensions (and equivalent functionality included in new Vulkan versions):
+   - VK_EXT_memory_budget: Used internally if available to query for current usage and budget. If not available, it falls back to an estimation based on memory heap sizes.
+   - VK_KHR_dedicated_allocation: Just enable it and it will be used automatically by the library.
+   - VK_AMD_device_coherent_memory
+   - VK_KHR_buffer_device_address
+- Defragmentation of GPU and CPU memory: Let the library move data around to free some memory blocks and make your allocations better compacted.
+- Lost allocations: Allocate memory with appropriate flags and let the library remove allocations that are not used for many frames to make room for new ones.
+- Statistics: Obtain detailed statistics about the amount of memory used, unused, number of allocated blocks, number of allocations etc. - globally, per memory heap, and per memory type.
+- Debug annotations: Associate string with name or opaque pointer to your own data with every allocation.
+- JSON dump: Obtain a string in JSON format with detailed map of internal state, including list of allocations and gaps between them.
+- Convert this JSON dump into a picture to visualize your memory. See [tools/VmaDumpVis](tools/VmaDumpVis/README.md).
+- Debugging incorrect memory usage: Enable initialization of all allocated memory with a bit pattern to detect usage of uninitialized or freed memory. Enable validation of a magic number before and after every allocation to detect out-of-bounds memory corruption.
+- Record and replay sequence of calls to library functions to a file to check correctness, measure performance, and gather statistics.
+
+# Prerequisites
+
+- Self-contained C++ library in single header file. No external dependencies other than standard C and C++ library and of course Vulkan. STL containers are not used by default.
+- Public interface in C, in same convention as Vulkan API. Implementation in C++.
+- Error handling implemented by returning `VkResult` error codes - same way as in Vulkan.
+- Interface documented using Doxygen-style comments.
+- Platform-independent, but developed and tested on Windows using Visual Studio. Continuous integration setup for Windows and Linux. Used also on Android, MacOS, and other platforms.
+
+# Example
+
+Basic usage of this library is very simple. Advanced features are optional. After you created global `VmaAllocator` object, a complete code needed to create a buffer may look like this:
+
+```cpp
+VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
+bufferInfo.size = 65536;
+bufferInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+
+VmaAllocationCreateInfo allocInfo = {};
+allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;
+
+VkBuffer buffer;
+VmaAllocation allocation;
+vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr);
+```
+
+With this one function call:
+
+1. `VkBuffer` is created.
+2. `VkDeviceMemory` block is allocated if needed.
+3. An unused region of the memory block is bound to this buffer.
+
+`VmaAllocation` is an object that represents memory assigned to this buffer. It can be queried for parameters like Vulkan memory handle and offset.
+
+# Binaries
+
+The release comes with precompiled binary executables for "VulkanSample" application which contains test suite and "VmaReplay" tool. They are compiled using Visual Studio 2019, so they require appropriate libraries to work, including "MSVCP140.dll", "VCRUNTIME140.dll", "VCRUNTIME140_1.dll". If their launch fails with error message telling about those files missing, please download and install [Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017 and 2019](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads), "x64" version.
+
+# Read more
+
+See **[Documentation](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/)**.
+
+# Software using this library
+
+- **[Detroit: Become Human](https://gpuopen.com/learn/porting-detroit-3/)**
+- **[Vulkan Samples](https://github.com/LunarG/VulkanSamples)** - official Khronos Vulkan samples. License: Apache-style.
+- **[Anvil](https://github.com/GPUOpen-LibrariesAndSDKs/Anvil)** - cross-platform framework for Vulkan. License: MIT.
+- **[Filament](https://github.com/google/filament)** - physically based rendering engine for Android, Windows, Linux and macOS, from Google. Apache License 2.0.
+- **[Atypical Games - proprietary game engine](https://developer.samsung.com/galaxy-gamedev/gamedev-blog/infinitejet.html)**
+- **[Flax Engine](https://flaxengine.com/)**
+- **[Lightweight Java Game Library (LWJGL)](https://www.lwjgl.org/)** - includes binding of the library for Java. License: BSD.
+- **[PowerVR SDK](https://github.com/powervr-graphics/Native_SDK)** - C++ cross-platform 3D graphics SDK, from Imagination. License: MIT.
+- **[Skia](https://github.com/google/skia)** - complete 2D graphic library for drawing Text, Geometries, and Images, from Google.
+- **[The Forge](https://github.com/ConfettiFX/The-Forge)** - cross-platform rendering framework. Apache License 2.0.
+- **[VK9](https://github.com/disks86/VK9)** - Direct3D 9 compatibility layer using Vulkan. Zlib license.
+- **[vkDOOM3](https://github.com/DustinHLand/vkDOOM3)** - Vulkan port of GPL DOOM 3 BFG Edition. License: GNU GPL.
+- **[vkQuake2](https://github.com/kondrak/vkQuake2)** - vanilla Quake 2 with Vulkan support. License: GNU GPL.
+- **[Vulkan Best Practice for Mobile Developers](https://github.com/ARM-software/vulkan_best_practice_for_mobile_developers)** from ARM. License: MIT.
+- **[RPCS3](https://github.com/RPCS3/rpcs3)** - PlayStation 3 emulator/debugger. License: GNU GPLv2.
+
+[Many other projects on GitHub](https://github.com/search?q=AMD_VULKAN_MEMORY_ALLOCATOR_H&type=Code) and some game development studios that use Vulkan in their games.
+
+# See also
+
+- **[D3D12 Memory Allocator](https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator)** - equivalent library for Direct3D 12. License: MIT.
+- **[Awesome Vulkan](https://github.com/vinjn/awesome-vulkan)** - a curated list of awesome Vulkan libraries, debuggers and resources.
+- **[VulkanMemoryAllocator-Hpp](https://github.com/malte-v/VulkanMemoryAllocator-Hpp)** - C++ binding for this library. License: CC0-1.0.
+- **[PyVMA](https://github.com/realitix/pyvma)** - Python wrapper for this library. Author: Jean-Sébastien B. (@realitix). License: Apache 2.0.
+- **[vk-mem](https://github.com/gwihlidal/vk-mem-rs)** - Rust binding for this library. Author: Graham Wihlidal. License: Apache 2.0 or MIT.
+- **[Haskell bindings](https://hackage.haskell.org/package/VulkanMemoryAllocator)**, **[github](https://github.com/expipiplus1/vulkan/tree/master/VulkanMemoryAllocator)** - Haskell bindings for this library. Author: Joe Hermaszewski (@expipiplus1). License BSD-3-Clause.
+- **[vma_sample_sdl](https://github.com/rextimmy/vma_sample_sdl)** - SDL port of the sample app of this library (with the goal of running it on multiple platforms, including MacOS). Author: @rextimmy. License: MIT.
+- **[vulkan-malloc](https://github.com/dylanede/vulkan-malloc)** - Vulkan memory allocation library for Rust. Based on version 1 of this library. Author: Dylan Ede (@dylanede). License: MIT / Apache 2.0.
--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
--- a/extern/vulkan_memory_allocator/vk_mem_alloc_impl.cc
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc_impl.cc
@@ -0,0 +1,3 @@
+#define VMA_IMPLEMENTATION
+
+#include "vk_mem_alloc.h"
--- a/intern/CMakeLists.txt
+++ b/intern/CMakeLists.txt
@@ -85,3 +85,7 @@ endif()
 if(UNIX AND NOT APPLE)
  add_subdirectory(libc_compat)
 endif()
+
+if(WITH_VULKAN)
+  add_subdirectory(shader_compiler)
+endif()
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -226,6 +226,9 @@ add_definitions(
  -DCCL_NAMESPACE_END=}
 )

+if(WITH_CYCLES_DEBUG)
+  add_definitions(-DWITH_CYCLES_DEBUG)
+endif()
 if(WITH_CYCLES_STANDALONE_GUI)
  add_definitions(-DWITH_CYCLES_STANDALONE_GUI)
 endif()
@@ -334,7 +337,7 @@ else()
 endif()

 # Warnings
-if(CMAKE_COMPILER_IS_GNUCXX)
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang")
  ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion")
  ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion")
  ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_no_error_unused_macros "-Wno-error=unused-macros")
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -218,6 +218,12 @@ enum_denoising_prefilter = (
    ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time", 3),
 )

+enum_direct_light_sampling_type = (
+    ('MULTIPLE_IMPORTANCE_SAMPLING', "Multiple Importance Sampling", "Multiple importance sampling is used to combine direct light contributions from next-event estimation and forward path tracing", 0),
+    ('FORWARD_PATH_TRACING', "Forward Path Tracing", "Direct light contributions are only sampled using forward path tracing", 1),
+    ('NEXT_EVENT_ESTIMATION', "Next-Event Estimation", "Direct light contributions are only sampled using next-event estimation", 2),
+)
+
 def update_render_passes(self, context):
    scene = context.scene
    view_layer = context.view_layer
@@ -353,7 +359,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        name="Scrambling Distance",
        default=1.0,
        min=0.0, max=1.0,
-        description="Lower values give faster rendering with GPU rendering and less noise with all devices at the cost of possible artifacts if set too low. Only works when not using adaptive sampling",
+        description="Reduce randomization between pixels to improve GPU rendering performance, at the cost of possible rendering artifacts if set too low. Only works when not using adaptive sampling",
    )
    preview_scrambling_distance: BoolProperty(
        name="Scrambling Distance viewport",
@@ -361,10 +367,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        description="Uses the Scrambling Distance value for the viewport. Faster but may flicker",
    )

-    adaptive_scrambling_distance: BoolProperty(
-        name="Adaptive Scrambling Distance",
+    auto_scrambling_distance: BoolProperty(
+        name="Automatic Scrambling Distance",
        default=False,
-        description="Uses a formula to adapt the scrambling distance strength based on the sample count",
+        description="Automatically reduce the randomization between pixels to improve GPU rendering performance, at the cost of possible rendering artifacts. Only works when not using adaptive sampling",
    )

    use_layer_samples: EnumProperty(
@@ -422,6 +428,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        default=0,
    )

+    direct_light_sampling_type: EnumProperty(
+        name="Direct Light Sampling Type",
+        description="The type of strategy used for sampling direct light contributions",
+        items=enum_direct_light_sampling_type,
+        default='MULTIPLE_IMPORTANCE_SAMPLING',
+    )
+
    min_light_bounces: IntProperty(
        name="Min Light Bounces",
        description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
@@ -777,8 +790,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
    )

    use_auto_tile: BoolProperty(
-        name="Auto Tiles",
-        description="Automatically render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
+        name="Using Tiling",
+        description="Render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
        default=True,
    )
    tile_size: IntProperty(
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -295,13 +295,13 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):

        layout.separator()

-        col = layout.column(align=True)
-        col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
-        col.prop(cscene, "scrambling_distance", text="Scrambling Distance")
-        col.prop(cscene, "adaptive_scrambling_distance", text="Adaptive")
-        sub = col.row(align=True)
+        heading = layout.column(align=True, heading="Scrambling Distance")
+        heading.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
+        heading.prop(cscene, "auto_scrambling_distance", text="Automatic")
+        sub = heading.row()
        sub.active = not cscene.use_preview_adaptive_sampling
        sub.prop(cscene, "preview_scrambling_distance", text="Viewport")
+        heading.prop(cscene, "scrambling_distance", text="Multiplier")

        layout.separator()

--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -199,7 +199,7 @@ static bool ObtainCacheParticleUV(Hair *hair,
          b_mesh->uv_layers.begin(l);

          float2 uv = zero_float2();
-          if (b_mesh->uv_layers.length())
+          if (!b_mesh->uv_layers.empty())
            b_psys.uv_on_emitter(psmd, *b_pa, pa_no, uv_num, &uv.x);
          CData->curve_uv.push_back_slow(uv);

@@ -261,7 +261,7 @@ static bool ObtainCacheParticleVcol(Hair *hair,
          b_mesh->vertex_colors.begin(l);

          float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-          if (b_mesh->vertex_colors.length())
+          if (!b_mesh->vertex_colors.empty())
            b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x);
          CData->curve_vcol.push_back_slow(vcol);

--- a/intern/cycles/blender/display_driver.cpp
+++ b/intern/cycles/blender/display_driver.cpp
@@ -334,7 +334,7 @@ bool BlenderDisplayDriver::update_begin(const Params &params,

  /* Update PBO dimensions if needed.
   *
-   * NOTE: Allocate the PBO for the the size which will fit the final render resolution (as in,
+   * NOTE: Allocate the PBO for the size which will fit the final render resolution (as in,
   * at a resolution divider 1. This was we don't need to recreate graphics interoperability
   * objects which are costly and which are tied to the specific underlying buffer size.
   * The downside of this approach is that when graphics interoperability is not used we are
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -555,7 +555,7 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh,
 /* Create uv map attributes. */
 static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)
 {
-  if (b_mesh.uv_layers.length() != 0) {
+  if (!b_mesh.uv_layers.empty()) {
    for (BL::MeshUVLoopLayer &l : b_mesh.uv_layers) {
      const bool active_render = l.active_render();
      AttributeStandard uv_std = (active_render) ? ATTR_STD_UV : ATTR_STD_NONE;
@@ -619,7 +619,7 @@ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)

 static void attr_create_subd_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivide_uvs)
 {
-  if (b_mesh.uv_layers.length() != 0) {
+  if (!b_mesh.uv_layers.empty()) {
    BL::Mesh::uv_layers_iterator l;
    int i = 0;

@@ -951,7 +951,7 @@ static void create_mesh(Scene *scene,
  N = attr_N->data_float3();

  /* create generated coordinates from undeformed coordinates */
-  const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.length() == 0) &&
+  const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.empty()) &&
                                    (mesh->need_attribute(scene, ATTR_STD_UV_TANGENT));
  if (mesh->need_attribute(scene, ATTR_STD_GENERATED) || need_default_tangent) {
    Attribute *attr = attributes.add(ATTR_STD_GENERATED);
--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -129,7 +129,7 @@ void BlenderSession::create_session()
  /* reset status/progress */
  last_status = "";
  last_error = "";
-  last_progress = -1.0f;
+  last_progress = -1.0;
  start_resize_time = 0.0;

  /* create session */
@@ -606,19 +606,6 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
  pass->set_type(bake_type_to_pass(bake_type, bake_filter));
  pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));

-  if (pass->get_type() == PASS_COMBINED) {
-    /* Filtering settings for combined pass. */
-    Integrator *integrator = scene->integrator;
-    integrator->set_use_direct_light((bake_filter & BL::BakeSettings::pass_filter_DIRECT) != 0);
-    integrator->set_use_indirect_light((bake_filter & BL::BakeSettings::pass_filter_INDIRECT) !=
-                                       0);
-    integrator->set_use_diffuse((bake_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0);
-    integrator->set_use_glossy((bake_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0);
-    integrator->set_use_transmission((bake_filter & BL::BakeSettings::pass_filter_TRANSMISSION) !=
-                                     0);
-    integrator->set_use_emission((bake_filter & BL::BakeSettings::pass_filter_EMIT) != 0);
-  }
-
  session->set_display_driver(nullptr);
  session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine));

@@ -628,6 +615,24 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
    sync->sync_camera(b_render, b_camera_override, width, height, "");
    sync->sync_data(
        b_render, b_depsgraph, b_v3d, b_camera_override, width, height, &python_thread_state);
+
+    /* Filtering settings for combined pass. */
+    if (pass->get_type() == PASS_COMBINED) {
+      Integrator *integrator = scene->integrator;
+      integrator->set_use_direct_light((bake_filter & BL::BakeSettings::pass_filter_DIRECT) != 0);
+      integrator->set_use_indirect_light((bake_filter & BL::BakeSettings::pass_filter_INDIRECT) !=
+                                         0);
+      integrator->set_use_diffuse((bake_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0);
+      integrator->set_use_glossy((bake_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0);
+      integrator->set_use_transmission(
+          (bake_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0);
+      integrator->set_use_emission((bake_filter & BL::BakeSettings::pass_filter_EMIT) != 0);
+    }
+
+    /* Always use transparent background for baking. */
+    scene->background->set_transparent(true);
+
+    /* Load built-in images from Blender. */
    builtin_images_load();
  }

@@ -854,7 +859,7 @@ void BlenderSession::get_status(string &status, string &substatus)
  session->progress.get_status(status, substatus);
 }

-void BlenderSession::get_progress(float &progress, double &total_time, double &render_time)
+void BlenderSession::get_progress(double &progress, double &total_time, double &render_time)
 {
  session->progress.get_time(total_time, render_time);
  progress = session->progress.get_progress();
@@ -862,10 +867,10 @@ void BlenderSession::get_progress(float &progress, double &total_time, double &r

 void BlenderSession::update_bake_progress()
 {
-  float progress = session->progress.get_progress();
+  double progress = session->progress.get_progress();

  if (progress != last_progress) {
-    b_engine.update_progress(progress);
+    b_engine.update_progress((float)progress);
    last_progress = progress;
  }
 }
@@ -874,7 +879,7 @@ void BlenderSession::update_status_progress()
 {
  string timestatus, status, substatus;
  string scene_status = "";
-  float progress;
+  double progress;
  double total_time, remaining_time = 0, render_time;
  float mem_used = (float)session->stats.mem_used / 1024.0f / 1024.0f;
  float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;
@@ -918,7 +923,7 @@ void BlenderSession::update_status_progress()
    last_status_time = current_time;
  }
  if (progress != last_progress) {
-    b_engine.update_progress(progress);
+    b_engine.update_progress((float)progress);
    last_progress = progress;
  }

--- a/intern/cycles/blender/session.h
+++ b/intern/cycles/blender/session.h
@@ -82,7 +82,7 @@ class BlenderSession {
  void tag_redraw();
  void tag_update();
  void get_status(string &status, string &substatus);
-  void get_progress(float &progress, double &total_time, double &render_time);
+  void get_progress(double &progress, double &total_time, double &render_time);
  void test_cancel();
  void update_status_progress();
  void update_bake_progress();
@@ -108,7 +108,7 @@ class BlenderSession {

  string last_status;
  string last_error;
-  float last_progress;
+  double last_progress;
  double last_status_time;

  int width, height;
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -365,8 +365,8 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)

  int samples = get_int(cscene, "samples");
  float scrambling_distance = get_float(cscene, "scrambling_distance");
-  bool adaptive_scrambling_distance = get_boolean(cscene, "adaptive_scrambling_distance");
-  if (adaptive_scrambling_distance) {
+  bool auto_scrambling_distance = get_boolean(cscene, "auto_scrambling_distance");
+  if (auto_scrambling_distance) {
    scrambling_distance *= 4.0f / sqrtf(samples);
  }

@@ -392,6 +392,12 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
    integrator->set_ao_bounces(0);
  }

+#ifdef WITH_CYCLES_DEBUG
+  DirectLightSamplingType direct_light_sampling_type = (DirectLightSamplingType)get_enum(
+      cscene, "direct_light_sampling_type", DIRECT_LIGHT_SAMPLING_NUM, DIRECT_LIGHT_SAMPLING_MIS);
+  integrator->set_direct_light_sampling_type(direct_light_sampling_type);
+#endif
+
  const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background);
  integrator->set_use_denoise(denoise_params.use);

@@ -872,7 +878,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,

  /* Time limit. */
  if (background) {
-    params.time_limit = get_float(cscene, "time_limit");
+    params.time_limit = (double)get_float(cscene, "time_limit");
  }
  else {
    /* For the viewport it kind of makes more sense to think in terms of the noise floor, which is
--- a/intern/cycles/blender/util.h
+++ b/intern/cycles/blender/util.h
@@ -303,7 +303,7 @@ static inline string image_user_file_path(BL::ImageUser &iuser,
  string filepath_str = string(filepath);
  if (load_tiled && ima.source() == BL::Image::source_TILED) {
    string udim;
-    if (ima.tiles.length() > 0) {
+    if (!ima.tiles.empty()) {
      udim = to_string(ima.tiles[0].number());
    }
    string_replace(filepath_str, udim, "<UDIM>");
@@ -647,7 +647,7 @@ static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob,
 {
  PointerRNA cobj = RNA_pointer_get(&b_ob.ptr, "cycles");

-  if (cobj.data && b_ob.modifiers.length() > 0 && experimental) {
+  if (cobj.data && !b_ob.modifiers.empty() && experimental) {
    BL::Modifier mod = b_ob.modifiers[b_ob.modifiers.length() - 1];
    bool enabled = preview ? mod.show_viewport() : mod.show_render();

--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -303,7 +303,7 @@ static void rtc_error_func(void *, enum RTCError, const char *str)
  VLOG(1) << str;
 }

-static double progress_start_time = 0.0f;
+static double progress_start_time = 0.0;

 static bool rtc_progress_func(void *user_ptr, const double n)
 {
--- a/intern/cycles/bvh/node.cpp
+++ b/intern/cycles/bvh/node.cpp
@@ -153,7 +153,7 @@ void BVHNode::update_time()
 namespace {

 struct DumpTraversalContext {
-  /* Descriptor of wile where writing is happening. */
+  /* Descriptor of file where writing is happening. */
  FILE *stream;
  /* Unique identifier of the node current. */
  int id;
--- a/intern/cycles/bvh/node.h
+++ b/intern/cycles/bvh/node.h
@@ -178,7 +178,7 @@ class InnerNode : public BVHNode {
    reset_unused_children();
  }

-  /* NOTE: This function is only used during binary BVH builder, and it
+  /* NOTE: This function is only used during binary BVH builder, and it's
   * supposed to be configured to have 2 children which will be filled-in in a
   * bit. But this is important to have children reset to NULL. */
  explicit InnerNode(const BoundBox &bounds) : BVHNode(bounds), num_children_(0)
--- a/intern/cycles/bvh/optix.cpp
+++ b/intern/cycles/bvh/optix.cpp
@@ -30,15 +30,17 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
    : BVH(params_, geometry_, objects_),
      device(device),
      traversable_handle(0),
-      as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
-      motion_transform_data(device, "optix motion transform", false)
+      as_data(make_unique<device_only_memory<char>>(
+          device, params.top_level ? "optix tlas" : "optix blas", false)),
+      motion_transform_data(
+          make_unique<device_only_memory<char>>(device, "optix motion transform", false))
 {
 }

 BVHOptiX::~BVHOptiX()
 {
-  // Acceleration structure memory is delayed freed on device, since deleting the
-  // BVH may happen while still being used for rendering.
+  /* Acceleration structure memory is freed on the device with a delay, since the BVH may be
+   * deleted while it is still being used for rendering. */
  device->release_optix_bvh(this);
 }

--- a/intern/cycles/bvh/optix.h
+++ b/intern/cycles/bvh/optix.h
@@ -25,14 +25,16 @@

 #  include "device/memory.h"

+#  include "util/unique_ptr.h"
+
 CCL_NAMESPACE_BEGIN

 class BVHOptiX : public BVH {
 public:
  Device *device;
  uint64_t traversable_handle;
-  device_only_memory<char> as_data;
-  device_only_memory<char> motion_transform_data;
+  unique_ptr<device_only_memory<char>> as_data;
+  unique_ptr<device_only_memory<char>> motion_transform_data;

 protected:
  friend class BVH;
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -88,7 +88,7 @@ endmacro()

 function(cycles_link_directories)
  if(APPLE)
-    # APPLE plaform uses full paths for linking libraries, and avoids link_directories.
+    # APPLE platform uses full paths for linking libraries, and avoids link_directories.
    return()
  endif()

--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -93,11 +93,6 @@ CPUDevice::~CPUDevice()
  texture_info.free();
 }

-bool CPUDevice::show_samples() const
-{
-  return (info.cpu_threads == 1);
-}
-
 BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
 {
  BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -60,8 +60,6 @@ class CPUDevice : public Device {
  CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
  ~CPUDevice();

-  virtual bool show_samples() const override;
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  /* Returns true if the texture info was copied to the device (meaning, some more
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -46,12 +46,6 @@ bool CUDADevice::have_precompiled_kernels()
  return path_exists(cubins_path);
 }

-bool CUDADevice::show_samples() const
-{
-  /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
-  return true;
-}
-
 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
 {
  return BVH_LAYOUT_BVH2;
@@ -242,6 +236,10 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
  cflags += " -DWITH_NANOVDB";
 #  endif

+#  ifdef WITH_CYCLES_DEBUG
+  cflags += " -DWITH_CYCLES_DEBUG";
+#  endif
+
  return cflags;
 }

@@ -777,6 +775,7 @@ void CUDADevice::generic_free(device_memory &mem)
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(cuda_mem_map_mutex);
+    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
    const CUDAMem &cmem = cuda_mem_map[&mem];

    /* If cmem.use_mapped_host is true, reference counting is used
@@ -1143,6 +1142,7 @@ void CUDADevice::tex_free(device_texture &mem)
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(cuda_mem_map_mutex);
+    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
    const CUDAMem &cmem = cuda_mem_map[&mem];

    if (cmem.texobject) {
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -76,8 +76,6 @@ class CUDADevice : public Device {

  static bool have_precompiled_kernels();

-  virtual bool show_samples() const override;
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -149,10 +149,6 @@ class Device {
    fprintf(stderr, "%s\n", error.c_str());
    fflush(stderr);
  }
-  virtual bool show_samples() const
-  {
-    return false;
-  }
  virtual BVHLayoutMask get_bvh_layout_mask() const = 0;

  /* statistics */
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -47,12 +47,6 @@ bool HIPDevice::have_precompiled_kernels()
  return path_exists(fatbins_path);
 }

-bool HIPDevice::show_samples() const
-{
-  /* The HIPDevice only processes one tile at a time, so showing samples is fine. */
-  return true;
-}
-
 BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
 {
  return BVH_LAYOUT_BVH2;
@@ -99,7 +93,7 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  }

  /* Setup device and context. */
-  result = hipGetDevice(&hipDevice, hipDevId);
+  result = hipDeviceGet(&hipDevice, hipDevId);
  if (result != hipSuccess) {
    set_error(string_printf("Failed to get HIP device handle from ordinal (%s)",
                            hipewErrorString(result)));
@@ -744,6 +738,7 @@ void HIPDevice::generic_free(device_memory &mem)
  if (mem.device_pointer) {
    HIPContextScope scope(this);
    thread_scoped_lock lock(hip_mem_map_mutex);
+    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
    const HIPMem &cmem = hip_mem_map[&mem];

    /* If cmem.use_mapped_host is true, reference counting is used
@@ -986,16 +981,16 @@ void HIPDevice::tex_alloc(device_texture &mem)
            << string_human_readable_number(mem.memory_size()) << " bytes. ("
            << string_human_readable_size(mem.memory_size()) << ")";

-    hip_assert(hipArray3DCreate(&array_3d, &desc));
+    hip_assert(hipArray3DCreate((hArray *)&array_3d, &desc));

    if (!array_3d) {
      return;
    }

    HIP_MEMCPY3D param;
-    memset(&param, 0, sizeof(param));
+    memset(&param, 0, sizeof(HIP_MEMCPY3D));
    param.dstMemoryType = hipMemoryTypeArray;
-    param.dstArray = &array_3d;
+    param.dstArray = array_3d;
    param.srcMemoryType = hipMemoryTypeHost;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
@@ -1061,12 +1056,13 @@ void HIPDevice::tex_alloc(device_texture &mem)

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+    /* Bindless textures. */
    hipResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

    if (array_3d) {
      resDesc.resType = hipResourceTypeArray;
-      resDesc.res.array.h_Array = &array_3d;
+      resDesc.res.array.h_Array = array_3d;
      resDesc.flags = 0;
    }
    else if (mem.data_height > 0) {
@@ -1111,6 +1107,7 @@ void HIPDevice::tex_free(device_texture &mem)
  if (mem.device_pointer) {
    HIPContextScope scope(this);
    thread_scoped_lock lock(hip_mem_map_mutex);
+    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
    const HIPMem &cmem = hip_mem_map[&mem];

    if (cmem.texobject) {
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -75,8 +75,6 @@ class HIPDevice : public Device {

  static bool have_precompiled_kernels();

-  virtual bool show_samples() const override;
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN

 device_memory::device_memory(Device *device, const char *name, MemoryType type)
    : data_type(device_type_traits<uchar>::data_type),
-      data_elements(device_type_traits<uchar>::num_elements_cpu),
+      data_elements(device_type_traits<uchar>::num_elements),
      data_size(0),
      device_size(0),
      data_width(0),
@@ -44,45 +44,6 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
 {
 }

-device_memory::device_memory(device_memory &&other) noexcept
-    : data_type(other.data_type),
-      data_elements(other.data_elements),
-      data_size(other.data_size),
-      device_size(other.device_size),
-      data_width(other.data_width),
-      data_height(other.data_height),
-      data_depth(other.data_depth),
-      type(other.type),
-      name(other.name),
-      device(other.device),
-      device_pointer(other.device_pointer),
-      host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer),
-      shared_counter(other.shared_counter),
-      original_device_ptr(other.original_device_ptr),
-      original_device_size(other.original_device_size),
-      original_device(other.original_device),
-      need_realloc_(other.need_realloc_),
-      modified(other.modified)
-{
-  other.data_elements = 0;
-  other.data_size = 0;
-  other.device_size = 0;
-  other.data_width = 0;
-  other.data_height = 0;
-  other.data_depth = 0;
-  other.device = 0;
-  other.device_pointer = 0;
-  other.host_pointer = 0;
-  other.shared_pointer = 0;
-  other.shared_counter = 0;
-  other.original_device_ptr = 0;
-  other.original_device_size = 0;
-  other.original_device = 0;
-  other.need_realloc_ = false;
-  other.modified = false;
-}
-
 device_memory::~device_memory()
 {
  assert(shared_pointer == 0);
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -81,155 +81,140 @@ static constexpr size_t datatype_size(DataType datatype)

 template<typename T> struct device_type_traits {
  static const DataType data_type = TYPE_UNKNOWN;
-  static const size_t num_elements_cpu = sizeof(T);
-  static const size_t num_elements_gpu = sizeof(T);
+  static const size_t num_elements = sizeof(T);
 };

 template<> struct device_type_traits<uchar> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uchar) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar2> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uchar2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar3> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uchar3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar4> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uchar4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint2> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uint2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint3> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uint3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint4> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uint4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(int) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int2> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(int2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int3> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int4> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(float) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float2> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(float2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float3> {
+  /* float3 has different size depending on the device, can't use it for interchanging
+   * memory between CPU and GPU.
+   *
+   * Leave body empty to trigger a compile error if used. */
+};
+
+template<> struct device_type_traits<packed_float3> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(packed_float3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float4> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(float4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<half> {
  static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(half) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<ushort4> {
  static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(ushort4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint16_t> {
  static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint16_t) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<half4> {
  static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(half4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint64_t> {
  static const DataType data_type = TYPE_UINT64;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint64_t) == num_elements * datatype_size(data_type));
 };

 /* Device Memory
@@ -281,11 +266,16 @@ class device_memory {

  /* Only create through subclasses. */
  device_memory(Device *device, const char *name, MemoryType type);
-  device_memory(device_memory &&other) noexcept;

-  /* No copying allowed. */
+  /* No copying and moving allowed.
+   *
+   * This is because device implementation might need to register device memory in an allocation
+   * map of some sort and use pointer as a key to identify blocks. Moving data from one place to
+   * another bypassing device allocation routines will make those maps hard to maintain. */
  device_memory(const device_memory &) = delete;
+  device_memory(device_memory &&other) noexcept = delete;
  device_memory &operator=(const device_memory &) = delete;
+  device_memory &operator=(device_memory &&) = delete;

  /* Host allocation on the device. All host_pointer memory should be
   * allocated with these functions, for devices that support using
@@ -320,9 +310,7 @@ template<typename T> class device_only_memory : public device_memory {
      : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
  {
    data_type = device_type_traits<T>::data_type;
-    data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
-                                          device_type_traits<T>::num_elements_gpu,
-                        1);
+    data_elements = max(device_type_traits<T>::num_elements, 1);
  }

  device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -378,15 +366,11 @@ template<typename T> class device_only_memory : public device_memory {

 template<typename T> class device_vector : public device_memory {
 public:
-  /* Can only use this for types that have the same size on CPU and GPU. */
-  static_assert(device_type_traits<T>::num_elements_cpu ==
-                device_type_traits<T>::num_elements_gpu);
-
  device_vector(Device *device, const char *name, MemoryType type)
      : device_memory(device, name, type)
  {
    data_type = device_type_traits<T>::data_type;
-    data_elements = device_type_traits<T>::num_elements_cpu;
+    data_elements = device_type_traits<T>::num_elements;
    modified = true;
    need_realloc_ = true;

--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -109,14 +109,6 @@ class MultiDevice : public Device {
    return error_msg;
  }

-  virtual bool show_samples() const override
-  {
-    if (devices.size() > 1) {
-      return false;
-    }
-    return devices.front().device->show_samples();
-  }
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override
  {
    BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -1032,7 +1032,7 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
    return false;
  }

-  device_only_memory<char> &out_data = bvh->as_data;
+  device_only_memory<char> &out_data = *bvh->as_data;
  if (operation == OPTIX_BUILD_OPERATION_BUILD) {
    assert(out_data.device == this);
    out_data.alloc_to_device(sizes.outputSizeInBytes);
@@ -1123,7 +1123,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
      operation = OPTIX_BUILD_OPERATION_UPDATE;
    }
    else {
-      bvh_optix->as_data.free();
+      bvh_optix->as_data->free();
      bvh_optix->traversable_handle = 0;
    }

@@ -1344,9 +1344,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
    unsigned int num_instances = 0;
    unsigned int max_num_instances = 0xFFFFFFFF;

-    bvh_optix->as_data.free();
+    bvh_optix->as_data->free();
    bvh_optix->traversable_handle = 0;
-    bvh_optix->motion_transform_data.free();
+    bvh_optix->motion_transform_data->free();

    optixDeviceContextGetProperty(context,
                                  OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
@@ -1379,8 +1379,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
        }
      }

-      assert(bvh_optix->motion_transform_data.device == this);
-      bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+      assert(bvh_optix->motion_transform_data->device == this);
+      bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
    }

    for (Object *ob : bvh->objects) {
@@ -1441,7 +1441,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)

        motion_transform_offset = align_up(motion_transform_offset,
                                           OPTIX_TRANSFORM_BYTE_ALIGNMENT);
-        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
                                           motion_transform_offset;
        motion_transform_offset += motion_transform_size;

--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -23,6 +23,7 @@
 #  include "device/optix/queue.h"
 #  include "device/optix/util.h"
 #  include "kernel/types.h"
+#  include "util/unique_ptr.h"

 CCL_NAMESPACE_BEGIN

@@ -76,7 +77,7 @@ class OptiXDevice : public CUDADevice {
  device_only_memory<KernelParamsOptiX> launch_params;
  OptixTraversableHandle tlas_handle = 0;

-  vector<device_only_memory<char>> delayed_free_bvh_memory;
+  vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
  thread_mutex delayed_free_bvh_mutex;

  class Denoiser {
--- a/intern/cycles/doc/license/readme.txt
+++ b/intern/cycles/doc/license/readme.txt
@@ -3,7 +3,7 @@ This program uses code from various sources, the default license is Apache 2.0
 for all code, with the following exceptions.

 Modified BSD License
-* Code adapated from Open Shading Language
+* Code adapted from Open Shading Language
 * Sobol direction vectors
 * Matrix inversion code from OpenEXR
 * MD5 Hash code
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -296,13 +296,13 @@ static BufferParams scale_buffer_params(const BufferParams &params, int resoluti

  scaled_params.window_x = params.window_x / resolution_divider;
  scaled_params.window_y = params.window_y / resolution_divider;
-  scaled_params.window_width = params.window_width / resolution_divider;
-  scaled_params.window_height = params.window_height / resolution_divider;
+  scaled_params.window_width = max(1, params.window_width / resolution_divider);
+  scaled_params.window_height = max(1, params.window_height / resolution_divider);

  scaled_params.full_x = params.full_x / resolution_divider;
  scaled_params.full_y = params.full_y / resolution_divider;
-  scaled_params.full_width = params.full_width / resolution_divider;
-  scaled_params.full_height = params.full_height / resolution_divider;
+  scaled_params.full_width = max(1, params.full_width / resolution_divider);
+  scaled_params.full_height = max(1, params.full_height / resolution_divider);

  scaled_params.update_offset_stride();

@@ -850,7 +850,8 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work)
 {
  if (progress_ != nullptr) {
    const int2 tile_size = get_render_tile_size();
-    const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
+    const uint64_t num_samples_added = uint64_t(tile_size.x) * tile_size.y *
+                                       render_work.path_trace.num_samples;
    const int current_sample = render_work.path_trace.start_sample +
                               render_work.path_trace.num_samples -
                               render_work.path_trace.sample_offset;
--- a/intern/cycles/integrator/path_trace_display.h
+++ b/intern/cycles/integrator/path_trace_display.h
@@ -76,7 +76,7 @@ class PathTraceDisplay {

  /* Copy buffer of rendered pixels of a given size into a given position of the texture.
   *
-   * This function does not acquire a lock. The reason for this is is to allow use of this function
+   * This function does not acquire a lock. The reason for this is to allow use of this function
   * for partial updates from different devices. In this case the caller will acquire the lock
   * once, update all the slices and release
   * the lock once. This will ensure that draw() will never use partially updated texture. */
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -840,6 +840,26 @@ int RenderScheduler::get_num_samples_to_path_trace() const
      num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
    }

+    /* When a time limit is used, clamp the calculated number of samples to keep occupancy.
+     * This is because the time limit causes the last render iteration to happen with fewer
+     * samples, which conflicts with the occupancy (a lower number of samples causes lower
+     * occupancy, and the calculation is based on the number of previously rendered samples).
+     *
+     * When a time limit is not used, the number of samples per render iteration either increases
+     * or stays the same, so there is no need to clamp number of samples calculated for occupancy.
+     */
+    if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+      const double remaining_render_time = max(
+          0.0, time_limit_ - (time_dt() - state_.start_render_time));
+      const double time_per_sample_average = path_trace_time_.get_average();
+      const double predicted_render_time = num_samples_to_occupy * time_per_sample_average;
+
+      if (predicted_render_time > remaining_render_time) {
+        num_samples_to_occupy = lround(num_samples_to_occupy *
+                                       (remaining_render_time / predicted_render_time));
+      }
+    }
+
    num_samples_to_render = max(num_samples_to_render,
                                min(num_samples_to_occupy, max_num_samples_to_render));
  }
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -273,6 +273,7 @@ set(SRC_KERNEL_UTIL_HEADERS
 )

 set(SRC_KERNEL_TYPES_HEADERS
+  tables.h
  textures.h
  types.h
 )
@@ -410,12 +411,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
      -I ${CMAKE_CURRENT_SOURCE_DIR}/..
      -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
      --use_fast_math
-      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
-
-    if(${experimental})
-      set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
-      set(name ${name}_experimental)
-    endif()
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
+      -Wno-deprecated-gpu-targets)

    if(WITH_NANOVDB)
      set(cuda_flags ${cuda_flags}
@@ -423,6 +420,10 @@ if(WITH_CYCLES_CUDA_BINARIES)
        -I "${NANOVDB_INCLUDE_DIR}")
    endif()

+    if(WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+    endif()
+
    if(WITH_CYCLES_CUBIN_COMPILER)
      string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)

@@ -571,13 +572,14 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
      -ffast-math
      -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})

-    if(${experimental})
-      set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
-      set(name ${name}_experimental)
+    if(WITH_NANOVDB)
+      set(hip_flags ${hip_flags}
+        -D WITH_NANOVDB
+        -I "${NANOVDB_INCLUDE_DIR}")
    endif()

    if(WITH_CYCLES_DEBUG)
-      set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+      set(hip_flags ${hip_flags} -D WITH_CYCLES_DEBUG)
    endif()

    add_custom_command(
@@ -618,6 +620,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
        -I "${NANOVDB_INCLUDE_DIR}")
    endif()

+    if(WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+    endif()
+
    if(WITH_CYCLES_CUBIN_COMPILER)
      # Needed to find libnvrtc-builtins.so. Can't do it from inside
      # cycles_cubin_cc since the env variable is read before main()
@@ -706,7 +712,7 @@ if(WITH_COMPILER_ASAN)
    string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=all")
    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
  elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
-    # With OSL, Cycles disables rtti in some modules, wich then breaks at linking
+    # With OSL, Cycles disables rtti in some modules, which then breaks at linking
    # when trying to use vptr sanitizer (included into 'undefined' general option).
    string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=vptr")
    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
    swapped = false;
    for (int j = 0; j < num_hits - 1; ++j) {
      if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp_hit = hits[j];
+        Intersection tmp_hit = hits[j];
        float3 tmp_Ng = Ng[j];
        hits[j] = hits[j + 1];
        Ng[j] = Ng[j + 1];
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -438,7 +438,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
  if (label & LABEL_TRANSMIT) {
    float threshold_squared = kernel_data.background.transparent_roughness_squared_threshold;

-    if (threshold_squared >= 0.0f) {
+    if (threshold_squared >= 0.0f && !(label & LABEL_DIFFUSE)) {
      if (bsdf_get_specular_roughness_squared(sc) <= threshold_squared) {
        label |= LABEL_TRANSMIT_TRANSPARENT;
      }
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -18,6 +18,7 @@

 #pragma once

+#include "kernel/tables.h"
 #include "kernel/types.h"
 #include "kernel/util/profiling.h"

--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -52,8 +52,9 @@ typedef unsigned long long uint64_t;
 #endif
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -85,7 +86,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */

--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,9 @@
 #include "kernel/device/gpu/parallel_sorted_index.h"
 #include "kernel/device/gpu/work_stealing.h"

+/* Include constant tables before entering Metal's context class scope (context_begin.h) */
+#include "kernel/tables.h"
+
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_begin.h"
 #endif
@@ -464,7 +467,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
+    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
  }
 }

@@ -892,6 +895,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  const auto can_split_mask = ccl_gpu_ballot(can_split);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
+    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
  }
 }
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

    /* For each thread within a warp compute how many other active states precede it. */
-    const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
-                                            ccl_gpu_thread_mask(thread_warp));
+    const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                        ccl_gpu_thread_mask(thread_warp));

    /* Last thread in warp stores number of active states for each warp. */
    if (thread_warp == ccl_gpu_warp_size - 1) {
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,8 +45,9 @@ typedef unsigned long long uint64_t;
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -84,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;

 #pragma clang diagnostic ignored "-Wunused-variable"
 #pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"

 /* Qualifiers */

@@ -42,8 +43,9 @@ using namespace metal;
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device __attribute__((noinline))
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global device
-#define ccl_static_constant static constant constexpr
+#define ccl_inline_constant static constant constexpr
 #define ccl_device_constant constant
 #define ccl_constant const device
 #define ccl_gpu_shared threadgroup
@@ -64,7 +66,7 @@ using namespace metal;
 #define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)

 #define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
-#define ccl_gpu_popc(x) popcount(x)
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup)

 // clang-format off

@@ -123,7 +125,6 @@ kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
-  INIT_DEBUG_BUFFER \
  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
 } \
 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
@@ -150,6 +151,31 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \

 // clang-format on

+/* volumetric lambda functions - use function objects for lambda-like functionality */
+#define VOLUME_READ_LAMBDA(function_call) \
+  struct FnObjectRead { \
+    KernelGlobals kg; \
+    ccl_private MetalKernelContext *context; \
+    int state; \
+\
+    VolumeStack operator()(const int i) const \
+    { \
+      return context->function_call; \
+    } \
+  } volume_read_lambda_pass{kg, this, state};
+
+#define VOLUME_WRITE_LAMBDA(function_call) \
+  struct FnObjectWrite { \
+    KernelGlobals kg; \
+    ccl_private MetalKernelContext *context; \
+    int state; \
+\
+    void operator()(const int i, VolumeStack entry) const \
+    { \
+      context->function_call; \
+    } \
+  } volume_write_lambda_pass{kg, this, state};
+
 /* make_type definitions with Metal style element initializers */
 #ifdef make_float2
 #  undef make_float2
@@ -204,6 +230,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 #define sinhf(x) sinh(float(x))
 #define coshf(x) cosh(float(x))
 #define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))

 /* Use native functions with possibly lower precision for performance,
 * no issues found so far. */
@@ -217,6 +244,8 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \

 #define NULL 0

+#define __device__
+
 /* texture bindings and sampler setup */

 struct Texture2DParamsMetal {
@@ -231,6 +260,9 @@ struct MetalAncillaries {
  device Texture3DParamsMetal *textures_3d;
 };

+#include "util/half.h"
+#include "util/types.h"
+
 enum SamplerType {
  SamplerFilterNearest_AddressRepeat,
  SamplerFilterNearest_AddressClampEdge,
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN

 typedef struct KernelParamsMetal {

-#define KERNEL_TEX(type, name) ccl_constant type *name;
+#define KERNEL_TEX(type, name) ccl_global const type *name;
 #include "kernel/textures.h"
 #undef KERNEL_TEX

--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -49,10 +49,11 @@ typedef unsigned long long uint64_t;
  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -86,7 +87,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */

--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -21,6 +21,8 @@

 #include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */

+#include "kernel/tables.h"
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -44,7 +46,7 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
 ccl_device_forceinline int get_object_id()
 {
 #ifdef __OBJECT_MOTION__
-  /* Always get the the instance ID from the TLAS
+  /* Always get the instance ID from the TLAS
   * There might be a motion transform node between TLAS and BLAS which does not have one. */
  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
 #else
@@ -159,9 +161,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()

  /* Record geometric normal. */
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
-  const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
-  const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  /* Continue tracing (without this the trace call would return after the first hit). */
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -160,7 +160,8 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);

-  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+  return atomic_fetch_and_add_uint32(
+             (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
         sample_offset;
 }

@@ -501,7 +502,7 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,

    /* Write shadow pass. */
    if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
-        (path_flag & PATH_RAY_CAMERA)) {
+        (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
      const float3 unshadowed_throughput = INTEGRATOR_STATE(
          state, shadow_path, unshadowed_throughput);
      const float3 shadowed_throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
@@ -552,7 +553,7 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
                                               const bool is_transparent_background_ray,
                                               ccl_global float *ccl_restrict render_buffer)
 {
-  float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L;
+  float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L;
  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
--- a/intern/cycles/kernel/film/passes.h
+++ b/intern/cycles/kernel/film/passes.h
@@ -177,7 +177,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals kg,
 #ifdef __PASSES__
  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);

-  if (!(path_flag & PATH_RAY_CAMERA)) {
+  if (!(path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
    return;
  }

--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
 * Lookup of attributes is different between OSL and SVM, as OSL is ustring
 * based while for SVM we use integer ids. */

-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, -1 if not subdivision triangle */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}

 ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
@@ -106,9 +111,9 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals kg,
 {
  Transform tfm;

-  tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
-  tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
-  tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
+  tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0);
+  tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1);
+  tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2);

  return tfm;
 }
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -126,8 +126,8 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);

 #  ifdef __RAY_DIFFERENTIALS__
    if (dx)
@@ -149,7 +149,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
    }
    else {
      return make_float3(0.0f, 0.0f, 0.0f);
@@ -168,8 +168,8 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1);

 #  ifdef __RAY_DIFFERENTIALS__
    if (dx)
@@ -191,7 +191,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
    }
    else {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/geom/motion_curve.h
+++ b/intern/cycles/kernel/geom/motion_curve.h
@@ -48,8 +48,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,

    offset += step * numkeys;

-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
  }
 }

@@ -106,10 +106,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,

    offset += step * numkeys;

-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
-    keys[2] = kernel_tex_fetch(__attributes_float3, offset + k2);
-    keys[3] = kernel_tex_fetch(__attributes_float3, offset + k3);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
+    keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2);
+    keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3);
  }
 }

--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@@ -43,9 +43,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+    verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+    verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+    verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
  }
  else {
    /* center step not store in this array */
@@ -54,9 +54,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,

    offset += step * numverts;

-    verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
  }
 }

@@ -70,9 +70,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
  }
  else {
    /* center step is not stored in this array */
@@ -81,9 +81,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,

    offset += step * numverts;

-    normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
  }
 }

--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -163,19 +163,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
  motion_triangle_vertices(kg, fobject, prim, time, verts);
  /* Ray-triangle intersection, unoptimized. */
  float t, u, v;
-  if (ray_triangle_intersect(P,
-                             dir,
-                             tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                             (ssef *)verts,
-#else
-                             verts[0],
-                             verts[1],
-                             verts[2],
-#endif
-                             &u,
-                             &v,
-                             &t)) {
+  if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
    /* Visibility flag test. we do it here under the assumption
     * that most triangles are culled by node flags.
@@ -229,19 +217,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
  motion_triangle_vertices(kg, local_object, prim, time, verts);
  /* Ray-triangle intersection, unoptimized. */
  float t, u, v;
-  if (!ray_triangle_intersect(P,
-                              dir,
-                              tmax,
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                              (ssef *)verts,
-#  else
-                              verts[0],
-                              verts[1],
-                              verts[2],
-#  endif
-                              &u,
-                              &v,
-                              &t)) {
+  if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
    return false;
  }

--- a/intern/cycles/kernel/geom/patch.h
+++ b/intern/cycles/kernel/geom/patch.h
@@ -380,7 +380,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg,
    *dv = make_float3(0.0f, 0.0f, 0.0f);

  for (int i = 0; i < num_control; i++) {
-    float3 v = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[i]));
+    float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);

    val += v * weights[i];
    if (du)
@@ -417,7 +417,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg,
    *dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

  for (int i = 0; i < num_control; i++) {
-    float4 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
+    float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]);

    val += v * weights[i];
    if (du)
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@@ -284,18 +284,33 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
    int numverts, numkeys;
    object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);

-    /* lookup attributes */
-    motion_pre = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
-    desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE) ? numverts : numkeys;
-    motion_post = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
 #ifdef __HAIR__
-    if (is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
-      object_position_transform(kg, sd, &motion_pre);
-      object_position_transform(kg, sd, &motion_post);
+    if (is_curve_primitive) {
+      motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+      desc.offset += numkeys;
+      motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+
+      /* Curve */
+      if ((sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+        object_position_transform(kg, sd, &motion_pre);
+        object_position_transform(kg, sd, &motion_post);
+      }
    }
+    else
 #endif
+        if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+      /* Triangle */
+      if (subd_triangle_patch(kg, sd) == ~0) {
+        motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+      else {
+        motion_pre = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+    }
  }

  /* object motion. note that depending on the mesh having motion vectors, this
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@

 CCL_NAMESPACE_BEGIN

-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
 /* UV coords of triangle within patch */

 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
@@ -443,8 +436,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    if (dy)
      *dy = make_float3(0.0f, 0.0f, 0.0f);

-    return float4_to_float3(
-        kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch)));
+    return kernel_tex_fetch(__attributes_float3,
+                            desc.offset + subd_triangle_patch_face(kg, patch));
  }
  else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
    float2 uv[3];
@@ -452,10 +445,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

    uint4 v = subd_triangle_patch_indices(kg, patch);

-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y));
-    float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z));
-    float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
+    float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
+    float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@@ -484,10 +477,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

    float3 f0, f1, f2, f3;

-    f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
-    f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
-    f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
-    f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
+    f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
+    f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
+    f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
+    f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@@ -513,7 +506,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    if (dy)
      *dy = make_float3(0.0f, 0.0f, 0.0f);

-    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+    return kernel_tex_fetch(__attributes_float3, desc.offset);
  }
  else {
    if (dx)
@@ -590,7 +583,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    if (dy)
      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

-    return kernel_tex_fetch(__attributes_float3,
+    return kernel_tex_fetch(__attributes_float4,
                            desc.offset + subd_triangle_patch_face(kg, patch));
  }
  else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
@@ -599,10 +592,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,

    uint4 v = subd_triangle_patch_indices(kg, patch);

-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
-    float4 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
-    float4 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y);
+    float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z);
+    float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@@ -642,10 +635,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset)));
    }
    else {
-      f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
-      f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
-      f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
-      f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
+      f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset);
+      f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset);
+      f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset);
+      f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset);
    }

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
@@ -672,7 +665,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    if (dy)
      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
  }
  else {
    if (dx)
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@@ -29,9 +29,9 @@ ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderDat
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-  const float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);

  /* return normal */
  if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
@@ -54,9 +54,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
  /* compute point */
  float t = 1.0f - u - v;
  *P = (u * v0 + v * v1 + t * v2);
@@ -78,9 +78,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
 {
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
 }

 /* Triangle vertex locations and vertex normals */
@@ -91,12 +91,12 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
                                                     float3 N[3])
 {
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
-  N[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  N[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  N[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
 }

 /* Interpolate smooth vertex normal from vertices */
@@ -106,9 +106,9 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);

  float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);

@@ -120,9 +120,9 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);

  /* ensure that the normals are in object space */
  if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
@@ -145,9 +145,9 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
 {
  /* fetch triangle vertex coordinates */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  const float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);

  /* compute derivatives of P w.r.t. uv */
  *dPdu = (p0 - p2);
@@ -267,15 +267,15 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
+      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
    }
    else {
      const int tri = desc.offset + sd->prim * 3;
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+      f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
+      f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
+      f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
    }

 #ifdef __RAY_DIFFERENTIALS__
@@ -298,7 +298,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                               desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
    }
    else {
      return make_float3(0.0f, 0.0f, 0.0f);
@@ -318,16 +318,16 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+      f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z);
    }
    else {
      const int tri = desc.offset + sd->prim * 3;
      if (desc.element == ATTR_ELEMENT_CORNER) {
-        f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
-        f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
-        f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
+        f0 = kernel_tex_fetch(__attributes_float4, tri + 0);
+        f1 = kernel_tex_fetch(__attributes_float4, tri + 1);
+        f2 = kernel_tex_fetch(__attributes_float4, tri + 2);
      }
      else {
        f0 = color_srgb_to_linear_v4(
@@ -359,7 +359,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                               desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
    }
    else {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -37,27 +37,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
 {
  const int prim = kernel_tex_fetch(__prim_index, prim_addr);
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#else
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-#endif
  float t, u, v;
-  if (ray_triangle_intersect(P,
-                             dir,
-                             tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                             ssef_verts,
-#else
-                             float4_to_float3(tri_a),
-                             float4_to_float3(tri_b),
-                             float4_to_float3(tri_c),
-#endif
-                             &u,
-                             &v,
-                             &t)) {
+  if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
    /* Visibility flag test. we do it here under the assumption
     * that most triangles are culled by node flags.
@@ -106,27 +90,11 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,

  const int prim = kernel_tex_fetch(__prim_index, prim_addr);
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#  else
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-#  endif
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float t, u, v;
-  if (!ray_triangle_intersect(P,
-                              dir,
-                              tmax,
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                              ssef_verts,
-#  else
-                              tri_a,
-                              tri_b,
-                              tri_c,
-#  endif
-                              &u,
-                              &v,
-                              &t)) {
+  if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
    return false;
  }

@@ -178,11 +146,6 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
  isect->t = t;

  /* Record geometric normal. */
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-#  endif
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  return false;
@@ -223,9 +186,9 @@ ccl_device_inline float3 triangle_refine(KernelGlobals kg,
  P = P + D * t;

  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
@@ -280,9 +243,9 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,

 #  ifdef __INTERSECTION_REFINE__
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -75,7 +75,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
                                          const AttributeDescriptor desc)
 {
  if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
  }
  else if (desc.element == ATTR_ELEMENT_VOXEL) {
    /* todo: optimize this so we don't have to transform both here and in
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -71,14 +71,16 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
  /* Setup render buffers. */
  const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
  const int pass_stride = kernel_data.film.pass_stride;
-  render_buffer += index * pass_stride;
+  ccl_global float *buffer = render_buffer + index * pass_stride;

-  ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive;
-  ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential;
+  ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
+  ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;

  const int seed = __float_as_uint(primitive[0]);
  int prim = __float_as_uint(primitive[1]);
  if (prim == -1) {
+    /* Accumulate transparency for empty pixels. */
+    kernel_accum_transparent(kg, state, 0, 1.0f, buffer);
    return false;
  }

--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -122,7 +122,7 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,
    /* volume scatter */
    flag |= PATH_RAY_VOLUME_SCATTER;
    flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
-    if (bounce == 1) {
+    if (!(flag & PATH_RAY_ANY_PASS)) {
      flag |= PATH_RAY_VOLUME_PASS;
    }

@@ -184,7 +184,7 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,
    }

    /* Render pass categories. */
-    if (bounce == 1) {
+    if (!(flag & PATH_RAY_ANY_PASS) && !(flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
      flag |= PATH_RAY_SURFACE_PASS;
    }
  }
@@ -208,9 +208,7 @@ ccl_device_inline bool path_state_volume_next(IntegratorState state)
  }

  /* Random number generator next bounce. */
-  if (volume_bounds_bounce > 1) {
-    INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
-  }
+  INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;

  return true;
 }
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -20,7 +20,6 @@
 #include "kernel/integrator/shader_eval.h"
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
-#include "kernel/sample/mis.h"

 CCL_NAMESPACE_BEGIN

@@ -81,8 +80,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
    /* multiple importance sampling, get background light pdf for ray
     * direction, and compute weight with respect to BSDF pdf */
    const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
-    const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
-
+    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
    L *= mis_weight;
  }
 #  endif
@@ -169,7 +167,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
        /* multiple importance sampling, get regular light pdf,
         * and compute weight with respect to BSDF pdf */
        const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-        const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+        const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
        light_eval *= mis_weight;
      }

--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -84,7 +84,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
    /* multiple importance sampling, get regular light pdf,
     * and compute weight with respect to BSDF pdf */
    const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
    light_eval *= mis_weight;
  }

--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -95,8 +95,8 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,

  shader_setup_from_volume(kg, shadow_sd, &ray);

-  const float step_size = volume_stack_step_size(
-      kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i));
+  const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);

  volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size);
 }
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -27,8 +27,6 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"

-#include "kernel/sample/mis.h"
-
 CCL_NAMESPACE_BEGIN

 ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
@@ -95,8 +93,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
    /* Multiple importance sampling, get triangle light pdf,
     * and compute weight with respect to BSDF pdf. */
    float pdf = triangle_light_pdf(kg, sd, t);
-    float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
+    float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
    L *= mis_weight;
  }

@@ -155,7 +152,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
  bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);

  if (ls.shader & SHADER_USE_MIS) {
-    const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+    const float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, bsdf_pdf);
    bsdf_eval_mul(&bsdf_eval, mis_weight);
  }

@@ -195,12 +192,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 pass_diffuse_weight = (bounce == 0) ?
-                                           bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
-                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
-    const float3 pass_glossy_weight = (bounce == 0) ?
-                                          bsdf_eval_pass_glossy_weight(&bsdf_eval) :
-                                          INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    const packed_float3 pass_diffuse_weight =
+        (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) :
+                        INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const packed_float3 pass_glossy_weight = (bounce == 0) ?
+                                                 packed_float3(
+                                                     bsdf_eval_pass_glossy_weight(&bsdf_eval)) :
+                                                 INTEGRATOR_STATE(state, path, pass_glossy_weight);
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
  }
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -27,8 +27,6 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"

-#include "kernel/sample/mis.h"
-
 CCL_NAMESPACE_BEGIN

 #ifdef __VOLUME__
@@ -78,9 +76,8 @@ ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
                                                   ccl_private ShaderData *ccl_restrict sd,
                                                   ccl_private float3 *ccl_restrict extinction)
 {
-  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, [=](const int i) {
-    return integrator_state_read_shadow_volume_stack(state, i);
-  });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);

  if (!(sd->flag & SD_EXTINCTION)) {
    return false;
@@ -98,9 +95,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
                                            ccl_private VolumeShaderCoefficients *coeff)
 {
  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-  shader_eval_volume<false>(kg, state, sd, path_flag, [=](const int i) {
-    return integrator_state_read_volume_stack(state, i);
-  });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass);

  if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
    return false;
@@ -263,6 +259,12 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
 /* Equi-angular sampling as in:
 * "Importance Sampling Techniques for Path Tracing in Participating Media" */

+/* Below this pdf we ignore samples, as they tend to lead to very long distances.
+ * This can cause performance issues with BVH traversal in OptiX, leading it to
+ * traverse many nodes. Since these contribute very little to the image, just ignore
+ * those samples. */
+#  define VOLUME_SAMPLE_PDF_CUTOFF 1e-8f
+
 ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict ray,
                                           const float3 light_P,
                                           const float xi,
@@ -437,7 +439,8 @@ ccl_device_forceinline void volume_integrate_step_scattering(

  /* Equiangular sampling for direct lighting. */
  if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
-    if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) {
+    if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t &&
+        vstate.equiangular_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
      const float new_dt = result.direct_t - vstate.start_t;
      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);

@@ -474,26 +477,28 @@ ccl_device_forceinline void volume_integrate_step_scattering(
      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
      const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);

-      /* throughput */
-      result.indirect_scatter = true;
-      result.indirect_t = new_t;
-      result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
-      shader_copy_volume_phases(&result.indirect_phases, sd);
+      if (vstate.distance_pdf * distance_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
+        /* throughput */
+        result.indirect_scatter = true;
+        result.indirect_t = new_t;
+        result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
+        shader_copy_volume_phases(&result.indirect_phases, sd);

-      if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
-        /* If using distance sampling for direct light, just copy parameters
-         * of indirect light since we scatter at the same point then. */
-        result.direct_scatter = true;
-        result.direct_t = result.indirect_t;
-        result.direct_throughput = result.indirect_throughput;
-        shader_copy_volume_phases(&result.direct_phases, sd);
+        if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+          /* If using distance sampling for direct light, just copy parameters
+           * of indirect light since we scatter at the same point then. */
+          result.direct_scatter = true;
+          result.direct_t = result.indirect_t;
+          result.direct_throughput = result.indirect_throughput;
+          shader_copy_volume_phases(&result.direct_phases, sd);

-        /* Multiple importance sampling. */
-        if (vstate.use_mis) {
-          const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
-          const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
-                                                   equiangular_pdf);
-          result.direct_throughput *= 2.0f * mis_weight;
+          /* Multiple importance sampling. */
+          if (vstate.use_mis) {
+            const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
+            const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
+                                                     equiangular_pdf);
+            result.direct_throughput *= 2.0f * mis_weight;
+          }
        }
      }
    }
@@ -761,7 +766,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
  const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);

  if (ls->shader & SHADER_USE_MIS) {
-    float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+    float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
    bsdf_eval_mul(&phase_eval, mis_weight);
  }

@@ -794,9 +799,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 pass_diffuse_weight = (bounce == 0) ?
-                                           one_float3() :
-                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const packed_float3 pass_diffuse_weight = (bounce == 0) ?
+                                                  packed_float3(one_float3()) :
+                                                  INTEGRATOR_STATE(
+                                                      state, path, pass_diffuse_weight);
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
  }
@@ -921,8 +927,8 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                                VOLUME_SAMPLE_DISTANCE;

  /* Step through volume. */
-  const float step_size = volume_stack_step_size(
-      kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);

  /* TODO: expensive to zero closures? */
  VolumeIntegrateResult result = {};
--- a/intern/cycles/kernel/integrator/shader_eval.h
+++ b/intern/cycles/kernel/integrator/shader_eval.h
@@ -122,23 +122,20 @@ ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
      for (int i = 0; i < sd->num_closure; i++) {
        ccl_private ShaderClosure *sc = &sd->closure[i];

-        if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
-          if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE) {
-            sc->type = CLOSURE_NONE_ID;
-            sc->sample_weight = 0.0f;
-          }
+        if ((CLOSURE_IS_BSDF_DIFFUSE(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE)) ||
+            (CLOSURE_IS_BSDF_GLOSSY(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY)) ||
+            (CLOSURE_IS_BSDF_TRANSMISSION(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION))) {
+          sc->type = CLOSURE_NONE_ID;
+          sc->sample_weight = 0.0f;
        }
-        else if (CLOSURE_IS_BSDF_GLOSSY(sc->type)) {
-          if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY) {
-            sc->type = CLOSURE_NONE_ID;
-            sc->sample_weight = 0.0f;
-          }
-        }
-        else if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) {
-          if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION) {
-            sc->type = CLOSURE_NONE_ID;
-            sc->sample_weight = 0.0f;
-          }
+        else if ((CLOSURE_IS_BSDF_TRANSPARENT(sc->type) &&
+                  (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSPARENT))) {
+          sc->type = CLOSURE_HOLDOUT_ID;
+          sc->sample_weight = 0.0f;
+          sd->flag |= SD_HOLDOUT;
        }
      }
    }
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -40,15 +40,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T
 /* enum PathRayFlag */
 KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput for shadow pass. */
 KERNEL_STRUCT_MEMBER(shadow_path,
-                     float3,
+                     packed_float3,
                     unshadowed_throughput,
                     KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_END(shadow_path)
@@ -56,8 +56,8 @@ KERNEL_STRUCT_END(shadow_path)
 /********************************** Shadow Ray *******************************/

 KERNEL_STRUCT_BEGIN(shadow_ray)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -59,12 +59,12 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
 KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Denoising. */
-KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
 /* Shader sorting. */
 /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
 KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
@@ -73,8 +73,8 @@ KERNEL_STRUCT_END(path)
 /************************************** Ray ***********************************/

 KERNEL_STRUCT_BEGIN(ray)
-KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
@@ -96,10 +96,10 @@ KERNEL_STRUCT_END(isect)
 /*************** Subsurface closure state for subsurface kernel ***************/

 KERNEL_STRUCT_BEGIN(subsurface)
-KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, Ng, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_END(subsurface)

 /********************************** Volume Stack ******************************/
--- a/Show More
+++ b/Show More